# Usage: python create_edges3.py parameter_for_net.txt
#
# Make it faster by spawning subprocesses.
#
# Make it smaller by spliting TPM.txt into small json files, and converting binding.txt to target_tf.txt first.
# So we don't need to load the big matrices, TPM.txt and binding.txt.
#
# 19 JAN 2017, hui, slcu

import sys, os, operator, itertools
import numpy as np
import scipy.stats as stat
from datetime import datetime

import rpy2.robjects as r
from rpy2.robjects.packages import importr
from rpy2.robjects import FloatVector

import warnings

import multiprocessing
import time

import json
from geneid2name import make_gene_name_AGI_map_dict
from param4net import make_global_param_dict, get_gene_name

EDGE_DIR = '../Data/history/edges/one_target'  # a directory storing all edge files, one for each target gene
GENE_ID_TO_GENE_NAME = '../Data/information/AGI-to-gene-names_v2.txt'
MAX_NUM_PROCESS = 20



####################################
DATA_SYMBOL         = '@'
TOP_N_TF            = 50
MIN_NUM_CONDITION   = 20
SIGNAL_INPUT_RATIO_TAU = 1.5
REMOVE_HORIZONTAL_STRIP_TAU = 0.01
REMOVE_VERTICAL_STRIP_TAU   = 0.01
MIN_NUMBER_OF_POINTS_FOR_MIXTURE_OF_GAUSSIAN = 30
####################################

def get_three_components_mixtools(y, x, cond_lst):

    # Remove NaNs or Infs
    warn_msg = ''
    sz = len(x)
    if sz < MIN_NUMBER_OF_POINTS_FOR_MIXTURE_OF_GAUSSIAN: # too few points, ignore.
        return None, None, None, None, None, None, None, None, None, 'IGNORE'

    index = np.isfinite(x) & np.isfinite(y)
    if sum(index) < sz:
        warn_msg = 'HAS_NAN_OR_INIFNITE'
        if sum(index) < MIN_NUMBER_OF_POINTS_FOR_MIXTURE_OF_GAUSSIAN:
            return None, None, None, None, None, None, None, None, None, 'IGNORE'
            
    xx = np.array(x[index])
    yy = np.array(y[index])
    cond_lst2 = np.array(cond_lst)
    cond_lst2 = cond_lst2[index]

    # Train the model
    mixtools =  importr('mixtools')
    try:
        result = mixtools.regmixEM(FloatVector(yy), FloatVector(xx), epsilon = 1e-04, k=3, maxit=100)
    except:
        return None, None, None, None, None, None, None, None, None, 'IGNORE'
    posterior = result[result.names.index('posterior')]
    posterior = np.array(posterior)
    l = np.argmax(posterior, axis=1) # class information
    idx1 = l == 0
    idx2 = l == 1
    idx3 = l == 2    
    warn_msg = 'loglik=%4.2f' % (np.array(result[result.names.index('loglik')])[0])

    return xx[idx1], yy[idx1], xx[idx2], yy[idx2], xx[idx3], yy[idx3], list(cond_lst2[idx1]), list(cond_lst2[idx2]), list(cond_lst2[idx3]), warn_msg


def read_experiment_id(fname):
    ''' Read column names (RNA-seq ids) and row names (gene ids) from TPM.txt '''

    colid = []
    rowid = []

    f = open(fname)
    lines = f.readlines()
    f.close()

    head_line = lines[0].strip()
    lst = head_line.split()
    colid = lst[1:]
    rowid = []

    for line in lines[1:]:
        line = line.strip()
        lst = line.split()
        g = lst[0]
        rowid.append(g)
    
    return (colid, rowid)


def read_matrix_data(fname):
    ''' 
    fname - a file, first line is head, first column is row name.
    '''
    
    lineno = 0
    colid = []
    rowid = []
    d =  {}  # {gene1:{cond1:val1, cond2:val2, ...}, gene2: {...}, ...}
    d2 = {} # {cond1:{gene1:val1, gene2:val2, ...}, cond2: {...}, ...}
    d3 = {} # {gene1: [], gene2: [], ...}
    d4 = {} # {cond1:[], cond2:[], ...}

    f = open(fname)
    lines = f.readlines()
    f.close()

    head_line = lines[0].strip()
    lst = head_line.split()
    colid = lst[1:]

    for c in colid:
        d2[c] = {}
        d4[c] = []
    
    for line in lines[1:]:
        line = line.strip()
        lst = line.split()
        g = lst[0]
        rowid.append(g)
        d[g] = {}
        levels = lst[1:]
        if len(levels) != len(colid):
            print('Incomplete columns at row %s' % (g))
            sys.exit()
            
        d3[g] = []
        for i in range(len(colid)):
            c = colid[i]
            d[g][c]  = float(levels[i])
            d2[c][g] = float(levels[i])
            d3[g].append(float(levels[i]))
            d4[c].append(float(levels[i]))
        lineno += 1

    d_return = {}
    d_return['xy'] = d  # first gene, then condition
    d_return['yx'] = d2 # first condition, then gene
    d_return['xx'] = d3 # each item is an array of gene expression levels, i.e., each item is a row
    d_return['yy'] = d4 # each item is an array of gene expression levels, i.e., each item is a column
    d_return['nrow'] = lineno - 1
    d_return['ncol'] = len(colid)
    d_return['rowid'] = rowid
    d_return['colid'] = colid    

    d4_sorted = {}
    for k in d4:
        d4_sorted[k] = sorted(d4[k], reverse=True)
    d_return['yy_sorted'] = d4_sorted

    return d_return


def get_value(s, delimit):
    lst = s.split(delimit)
    return lst[1].strip()


def read_info_data(fname):
    ''' Read chip-seq data information '''

    if not os.path.exists(fname):
        print('%s not exists.' % (fname) )
        sys.exit()
        
    d = {'ID_LIST':[]}
    f = open(fname)
    lines = f.readlines()
    f.close()
    for line in lines:
        line = line.strip()
        if line == '' or line.startswith('#') or line.startswith('%'):
            continue
        if line.startswith(DATA_SYMBOL):
            s = line[line.rfind(DATA_SYMBOL[-1])+1:]
            s = s.strip()
            if s in d:
                print('ID %s duplicate' % (s))
                sys.exit()
            d[s] = {'PROTEIN_ID':'', 'PROTEN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''}
            d['ID_LIST'].append(s)
        if line.startswith('DESCRIPTION:'):
            d[s]['DESCRIPTION'] = get_value(line, ':')
        elif line.startswith('PROTEN_NAME:'):
            d[s]['PROTEN_NAME'] = get_value(line, ':')
        elif line.startswith('PROTEIN_ID:'):
            d[s]['PROTEIN_ID'] = get_value(line, ':')
        elif line.startswith('DATA_NAME:'):
            d[s]['DATA_NAME'] = get_value(line, ':')                        
        elif line.startswith('DATA_FORMAT:'):
            d[s]['DATA_FORMAT'] = get_value(line, ':')
        elif line.startswith('LOCATION:'):
            d[s]['LOCATION'] = get_value(line, ':')
        elif line.startswith('NOTE:'):
            d[s]['NOTE'] = get_value(line, ':')

    return d


def get_gene_list(fname):
    result = []
    f = open(fname)
    for line in f:
        line = line.strip()
        lst = line.split()
        result.append(lst[0])
    f.close()
    return result


def get_related_condition(s, info_dict):
    lst = s.split(';')
    result = [] # a list of sample IDs
    result = info_dict['ID_LIST'] # TBD
    return result

def update_global_param_dict(glb_param_dict, info_dict):
    if glb_param_dict['RESEARCH_KEYWORDS'] == '':
        glb_param_dict['USER_CONDITION_LIST'] = info_dict['ID_LIST']
    glb_param_dict['USER_CONDITION_LIST'] = get_related_condition(glb_param_dict['RESEARCH_KEYWORDS'], info_dict)
    

def float_equal(x, y):
    return np.abs(x-y) < 0.001


def get_gene_expression2(gene_id1, gene_id2, cond_lst, expr_dict, takelog=False):
    ''' get gene expression for two genes.  Conditions in which two genes have zero TPM values are ignored. '''
    num_cond = len(cond_lst)
    elst1 = [None]*num_cond
    elst2 = [None]*num_cond
    clst  = [None]*num_cond
    d = expr_dict['xy']
    j = 0
    for i in range(num_cond):
        c = cond_lst[i]
        x = expr_dict['xy'][gene_id1][c]
        y = expr_dict['xy'][gene_id2][c]
        #print('DEBUG %s %s %g %g c=%s' % (gene_id1, gene_id2, x, y, c))
        #print('DEBUG at2g07745 at R0000SRR1802166XX %g' % (expr_dict['xy']['AT2G07754']['R0000SRR1802166XX']))
        if not float_equal(x,0.0) or not float_equal(y,0.0): # at least one is not zero
            if takelog == True: # increase gene expression uniformly by 1 for taking logarithm
                elst1[j] = np.log(x+1)
                elst2[j] = np.log(y+1)
            else:
                elst1[j] = x
                elst2[j] = y
            clst[j] = c
            j += 1
    return ( np.array(elst1[0:j]), np.array(elst2[0:j]), clst[0:j] )


def get_gene_expression3(gene_id1, gene_id2, cond_lst, expr_dict, takelog=False):
    ''' 
    get gene expression for two genes.  Conditions in which two genes have zero TPM values are ignored. 
    In addition, vertical strip (as seen in their scatter plot) and horizontal strip are removed.
    '''
    num_cond = len(cond_lst)
    elst1 = [None]*num_cond
    elst2 = [None]*num_cond
    mark_cond = [True]*num_cond # indicate if a condition should be included
    clst  = []
    d = expr_dict['xy']

    for i in range(num_cond):
        c = cond_lst[i]
        x = expr_dict['xy'][gene_id1][c]
        y = expr_dict['xy'][gene_id2][c]
        if not float_equal(x,0.0) or not float_equal(y,0.0): # at least one is not zero
            if takelog == True: # increase gene expression uniformly by 1 for taking logarithm
                elst1[i] = np.log(x+1)
                elst2[i] = np.log(y+1)
            else:
                elst1[i] = x
                elst2[i] = y
            if elst1[i] < REMOVE_VERTICAL_STRIP_TAU or elst2[i] < REMOVE_HORIZONTAL_STRIP_TAU: # don't include this condition if its values are in the strip
                mark_cond[i] = False
            else:
                clst.append(c)
        else:
            mark_cond[i] = False

    a = np.array(elst1)
    a = a[mark_cond]
    b = np.array(elst2)
    b = b[mark_cond]
    return (a.astype(np.float64), b.astype(np.float64), clst)


#################### select condition stuff ###############

def common_elements(list1, list2):
     return sorted(list(set(list1).intersection(list2)))

def correlation_is_significant(r, p, glb_param_dict):
    return (not np.isnan(r)) and np.abs(r) > float(glb_param_dict['TWO_WAY_CORRELATION_CUTOFF']) and p < float(glb_param_dict['TWO_WAY_CORRELATION_PVALUE_CUTOFF'])

def subset_list(lst, bool_index):
    if len(lst) != len(bool_index):
        print('subset_list: list size not equal (%d %s)', len(lst), len(bool_index))
        sys.exit()

    n = len(lst)
    result = []
    for i in range(n):
        if bool_index[i] == True:
            result.append(lst[i])
    return result


def write_dict(d_lst, glb_param_dict, fname):
    ''' write a list of dictionary content to fname.  Each dictionary contains a target-TF edge. '''
    agi2name_dict = glb_param_dict['name_conversion_dict']
    curr_date = datetime.now().strftime('%Y%m%d_%H%M%S') # add date to the end of each line, for future reference or filtering
    fname = os.path.join(EDGE_DIR, fname)
    f = open(fname, 'w')
    for d in d_lst:
        if d['type'] == 'two-way':
            gene_id = d['target']
            head = '%s %s\t' % (gene_id, get_gene_name(gene_id, agi2name_dict))
            gene_id = d['TF']
            head += '%s %s\t' % (gene_id, get_gene_name(gene_id, agi2name_dict))
            d2 = d['significant']
            if 'all' in d2:
                s = '%4.2f\t%s\t%s\t%s\t%s' % (d2['all']['score'], 'all', '.', ' '.join(d2['all']['chip_id']), '.')
                s += '\t' + curr_date
                f.write(head + s + '\n')
            if 'mix' in d2:
                n = len(d2['mix']['score'])
                for i in range(n):
                    s = '%4.2f\t%s\t%s\t%s\t%s' % (d2['mix']['score'][i], 'mix', ' '.join(d2['mix']['signal_set'][i]), ' '.join(d2['mix']['chip_id']), d2['mix']['message'][i])
                    s += '\t' + curr_date
                    f.write(head + s + '\n')
    f.close()


def two_way(target, tf_dict, expr_dict, expr_info_dict, glb_param_dict):
    '''

    Check if target has relationship with each of TFs.
    
    tf_dict: a dictionary of TFs, {tf_name:ChIP_ID_LIST)

    Return a list of dictionaries. Each dictionary has the following format:  

        'type'  :'two-way'
        'target':''
        'TF'    :''
        'significant': {
            'all':          {'signal_set':[], score=2, chip_id:''}
            'pos_direction':{'signal_set':[], score:.0, chip_id:''}
            'neg_direction':{'signal_set':[], score:.1, chip_id:''}
            'user_defined': {'signal_set':[], score:.3, chip_id:''}
            'mix':          {'signal_set':[], score:[.7,-.5], chip_id:''}
        }

    '''
    
    result_dict_lst = [] # a list of dictionaries, one for each TF, each dict contains info for a Target-TF pair
    
    target_gene_id = target
    all_cond_lst = expr_dict['colid'] # Use all RNA-seq samples.   TBD, can be glb_param_dict['USER_CONDITION_LIST']
    logrithmize = glb_param_dict['LOGRITHMIZE'].upper() == 'YES' # take logarithmic of TPM values

    if not target_gene_id in expr_dict['rowid']: # target gene not in expression table, cannot do anything
        return None
    
    for tf_gene_id in sorted(tf_dict.keys()):  # y is a TF gene id

        chip_id = tf_dict[tf_gene_id] # a list of chip experiment IDs, e.g., C00000000000

        if not tf_gene_id in expr_dict['rowid']: # tf gene not in expression table, cannot do anything
            continue

        # get gene expression profiles for target and TF.  If in a RNA-seq sample, both target and TF is 0, then this sample is ignored.
        target_elst, tf_elst, clist = get_gene_expression3(target_gene_id, tf_gene_id, all_cond_lst, expr_dict, takelog=logrithmize)

        r, p = stat.pearsonr(target_elst, tf_elst)

        d = {}
        d['target'] = target_gene_id
        d['TF'] = tf_gene_id
        d['type'] = 'two-way'
        d['significant'] = {}

        all_good = False
        if correlation_is_significant(r, p, glb_param_dict):
            d['significant']['all'] =  {}
            d['significant']['all']['signal_set'] =  clist # a list of sample IDs, returned by get_gene_expression3
            d['significant']['all']['score'] =  r
            d['significant']['all']['chip_id']  = chip_id            
            all_good = True
            
        user_cond_lst =  glb_param_dict['USER_CONDITION_LIST']
        if glb_param_dict['RESEARCH_KEYWORDS'] != '' and user_cond_lst != []:
             target_elst_user, tf_elst_user, clist_user = get_gene_expression3(target_gene_id, tf_gene_id, user_cond_lst, expr_dict, takelog=logrithmize)
             
             r, p = stat.pearsonr(target_elst_user, tf_elst_user)
             if correlation_is_significant(r, p, glb_param_dict):
                d['significant']['user'] =  {}
                d['significant']['user']['signal_set'] = user_cond_lst
                d['significant']['user']['score'] = r
                d['significant']['user']['chip_id']  = chip_id

        # obsolete
        max_diff = glb_param_dict['SELECT_POINTS_DIAGONAL_MAX_DIFF']
        if glb_param_dict['LOOK_FOR_POS_CORRELATION'] == 'YES':
            aa, bb, index_pos = select_points_diagonal(target_elst, tf_elst, max_diff, 'pos')
            r_pos, p_pos = stat.pearsonr(aa, bb)
            if correlation_is_significant(r_pos, p_pos, glb_param_dict) and sum(index_pos) >= MIN_NUM_CONDITION:
                d['significant']['pos'] =  {}
                d['significant']['pos']['signal_set'] = subset_list(all_cond_lst, index_pos)
                d['significant']['pos']['score'] = r_pos
                d['significant']['pos']['chip_id']  = chip_id

        # obsolete
        if glb_param_dict['LOOK_FOR_NEG_CORRELATION'] == 'YES':
            aa, bb, index_neg = select_points_diagonal(target_elst, tf_elst, max_diff, 'neg')
            r_neg, p_neg = stat.pearsonr(aa, bb)
            if correlation_is_significant(r_neg, p_neg, glb_param_dict) and sum(index_neg) >= MIN_NUM_CONDITION:
                d['significant']['neg'] =  {}
                d['significant']['neg']['signal_set'] = subset_list(all_cond_lst, index_neg)
                d['significant']['neg']['score'] = r_neg
                d['significant']['neg']['chip_id']  = chip_id

        K = int(glb_param_dict['NUMBER_OF_COMPONENTS'])
        if glb_param_dict['MIXTURE_OF_REGRESSION'] == 'YES' and not all_good: # look hard only when using all RNA-seq data does not produce good results
            if K == 2: # for now consider two components
                #print('DEBUG len1=%d, len=%d' % (len(target_elst), len(tf_elst)))
                #print('DEBUG %s, %s, %s' % (target_gene_id, tf_gene_id, ' '.join(clist)))                   
                index1, index2, msg = get_two_components(target_elst, tf_elst) # get two Gaussian Mixture Model components
                if msg != 'IGNORE':
                    aa = target_elst[index1]
                    bb = tf_elst[index1]
                    r_mix1, p_mix1 = stat.pearsonr(aa, bb)
                    aa = target_elst[index2]
                    bb = tf_elst[index2]
                    r_mix2, p_mix2 = stat.pearsonr(aa, bb)
                    #print('DEBUG %s %s r_mix1:%g r_mix2:%g' % (target_gene_id, tf_gene_id, r_mix1, r_mix2))
                    flag1 = correlation_is_significant(r_mix1, p_mix1, glb_param_dict)
                    flag2 = correlation_is_significant(r_mix2, p_mix2, glb_param_dict)
                    if flag1 or flag2:
                        d['significant']['mix'] =  {}
                        d['significant']['mix']['signal_set'] = []
                        d['significant']['mix']['score'] = []
                        d['significant']['mix']['chip_id']  = chip_id
                        if flag1:
                            d['significant']['mix']['signal_set'].append(subset_list(clist, index1))
                            d['significant']['mix']['score'].append(r_mix1)
                        if flag2:
                            d['significant']['mix']['signal_set'].append(subset_list(clist, index2))
                            d['significant']['mix']['score'].append(r_mix2)

            if K == 3: # three components
                aa1, bb1, aa2, bb2, aa3, bb3, cond1, cond2, cond3, msg = get_three_components_mixtools(target_elst, tf_elst, clist) # get two Gaussian Mixture Model components
                if msg != 'IGNORE':
                    r_mix1, p_mix1 = stat.pearsonr(aa1, bb1)
                    r_mix2, p_mix2 = stat.pearsonr(aa2, bb2)
                    r_mix3, p_mix3 = stat.pearsonr(aa3, bb3)
                    #print('DEBUG %s, %s' % (target_gene_id, tf_gene_id))
                    #print('DEBUG rmix1=%g, pmix1=%g' % (r_mix1, p_mix1))
                    #print('DEBUG rmix2=%g, pmix2=%g' % (r_mix2, p_mix2))
                    #print('DEBUG rmix3=%g, pmix3=%g' % (r_mix3, p_mix3))
                    #print('DEBUG %d %d %d' %(len(aa1), len(aa2), len(aa3)))
                    min_num_points = int(glb_param_dict['CORRELATION_BASED_ON_AT_LEAST_N_POINTS'])
                    flag1 = correlation_is_significant(r_mix1, p_mix1, glb_param_dict) and len(aa1) > min_num_points
                    flag2 = correlation_is_significant(r_mix2, p_mix2, glb_param_dict) and len(aa2) > min_num_points
                    flag3 = correlation_is_significant(r_mix3, p_mix3, glb_param_dict) and len(aa3) > min_num_points                   
                    if flag1 or flag2 or flag3:
                        d['significant']['mix'] =  {}
                        d['significant']['mix']['signal_set'] = []
                        d['significant']['mix']['score'] = []
                        d['significant']['mix']['chip_id']  = chip_id
                        d['significant']['mix']['message'] = []
                        if flag1:
                            d['significant']['mix']['signal_set'].append(cond1)
                            d['significant']['mix']['score'].append(r_mix1)
                            d['significant']['mix']['message'].append(msg)
                        if flag2:
                            d['significant']['mix']['signal_set'].append(cond2)
                            d['significant']['mix']['score'].append(r_mix2)
                            d['significant']['mix']['message'].append(msg)                            
                        if flag3:
                            d['significant']['mix']['signal_set'].append(cond3)
                            d['significant']['mix']['score'].append(r_mix3)
                            d['significant']['mix']['message'].append(msg)                            
                            
        if len(d['significant']) > 0: # significant edges exist
            result_dict_lst.append(d)


    curr_time = datetime.now().strftime('%Y%m%d')
    fname = 'edges.txt' + '.' + curr_time +  '.' + target_gene_id
    if result_dict_lst != []:
        write_dict(result_dict_lst, glb_param_dict, fname)


def three_way(target, tf_lst, expr_dict, expr_info_dict, glb_param_dict):
    ''' TBD '''
    return []


def make_small_expr_dict(lst, colid, rowid, dirname):
    d_return = {}
    d = {}
    for g in lst:
        fname = os.path.join(dirname, g + '.json')
        if os.path.exists(fname):
            with open(fname) as f:
                d[g] = json.load(f)

    d_return['colid'] = colid
    d_return['rowid'] = rowid    
    d_return['xy'] = d

    return d_return

def make_tf_dict(fname):
    d = {}
    f = open(fname)
    for line in f:
        line = line.strip()
        lst = line.split('\t')
        target = lst[0]
        tf     = lst[1]
        cond   = lst[2].split()
        if not target in d:
            d[target] = {tf:cond}
        else:
            d[target][tf] = cond
    f.close()
    return d


def establish_edges(jsonTPM_dir, expr_info_dict, rna_experiment_ids, rna_gene_ids, d, bind_info_dict, glb_param_dict):
    ''' d - binding dictionary {target:{tf:[c1,c2], tf:[c2,c3]}, ...  }  '''

    gene_lst = get_gene_list(glb_param_dict['GENE_LIST'])
    high_gene_lst = glb_param_dict['HIGH_PRIORITY_GENE'].split() # high priority genes
    update_global_param_dict(glb_param_dict, expr_info_dict)

    if not os.path.isdir(EDGE_DIR):
        os.makedirs(EDGE_DIR)
        
    process_lst = [] # a list of processes
    final_gene_lst = high_gene_lst
    for x in gene_lst:
        if not x in high_gene_lst:
            final_gene_lst.append(x)

    for g in final_gene_lst: # high priority genes are in the front
        if g in d: # target g is in binding dictionary d
            tf_dict = d[g]
            if len(tf_dict) > 0: # it has TFs, usually it is the case
                if glb_param_dict['TWO_WAY'] == 'YES':
                    small_gene_lst = tf_dict.keys()
                    small_gene_lst.append(g)
                    expr_dict = make_small_expr_dict(small_gene_lst, rna_experiment_ids, rna_gene_ids, jsonTPM_dir)
                    p = multiprocessing.Process(target=two_way, args=(g, tf_dict, expr_dict, expr_info_dict, glb_param_dict))
                    process_lst.append(p)
                    p.start()
                    if len(process_lst) > MAX_NUM_PROCESS:
                        temp_lst = []
                        for p in process_lst:
                            p.join(timeout=60) # wait for subprocess to finish, wait for 60 secs, if it has not finished yet, check next subprocess
                            if p.is_alive():
                                temp_lst.append(p)
                        process_lst = temp_lst


########## main ##################################################
r.r['options'](warn=-1) # supress warning message from rpy2
warnings.filterwarnings("ignore")

param_file = sys.argv[1] # a single prameter file
glb_param_dict = make_global_param_dict(param_file)
agi2name_dict = make_gene_name_AGI_map_dict(GENE_ID_TO_GENE_NAME)
glb_param_dict['name_conversion_dict'] = agi2name_dict

#print('Read expression data')
cmd = 'python TPM2JSON.py %s' % (param_file) # make jsonTPM directory
os.system(cmd)
curr_time = datetime.now().strftime('%Y%m%d_%H%M')
JSON_DIR = '../Data/history/expr/jsonTPM_%s' % (curr_time)
cmd = 'mv ../Data/history/expr/jsonTPM %s' % (JSON_DIR)
os.system(cmd)

expr_info_dict = read_info_data(glb_param_dict['EXPRESSION_INFO'])
colid, rowid = read_experiment_id(glb_param_dict['EXPRESSION_MATRIX'])

#print('Read binding data')
cmd = 'python make_target_tf.py %s' % (param_file)  # make target_tf.txt
os.system(cmd)   # change
curr_time = datetime.now().strftime('%Y%m%d_%H%M')
target_tf_fname = 'target_tf.txt.' + curr_time
cmd = 'cp target_tf.txt %s' % (target_tf_fname)
os.system(cmd)
big_tf_dict = make_tf_dict(target_tf_fname)
bind_info_dict = read_info_data(glb_param_dict['BINDING_INFO'])

#print('Establish edges')
establish_edges(JSON_DIR, expr_info_dict, colid, rowid, big_tf_dict, bind_info_dict, glb_param_dict)