From 97fdefab064f63642fa3ece05b807d29b459df31 Mon Sep 17 00:00:00 2001
From: Hui Lan
Date: Wed, 4 Dec 2019 19:03:19 +0800
Subject: brain: add python and R code to local repository.

---
 Code/create_edges3.py | 615 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 615 insertions(+)
 create mode 100644 Code/create_edges3.py

(limited to 'Code/create_edges3.py')

diff --git a/Code/create_edges3.py b/Code/create_edges3.py
new file mode 100644
index 0000000..3cd5ba4
--- /dev/null
+++ b/Code/create_edges3.py
@@ -0,0 +1,615 @@
+# Usage: python create_edges3.py parameter_for_net.txt
+#
+# Make it faster by spawning subprocesses.
+#
+# Make it smaller by splitting TPM.txt into small JSON files, and converting binding.txt to target_tf.txt first.
+# This way we don't need to load the big matrices, TPM.txt and binding.txt.
+#
+# 19 JAN 2017, hui, slcu
+
+import sys, os, operator, itertools
+import numpy as np
+import scipy.stats as stat
+from datetime import datetime
+
+import rpy2.robjects as r
+from rpy2.robjects.packages import importr
+from rpy2.robjects import FloatVector
+
+import warnings
+
+import multiprocessing
+import time
+
+import json
+from geneid2name import make_gene_name_AGI_map_dict
+from param4net import make_global_param_dict, get_gene_name
+
+EDGE_DIR = '../Data/history/edges/one_target' # a directory storing all edge files, one for each target gene
+GENE_ID_TO_GENE_NAME = '../Data/information/AGI-to-gene-names_v2.txt'
+MAX_NUM_PROCESS = 20
+
+
+
+####################################
+DATA_SYMBOL = '@'
+TOP_N_TF = 50
+MIN_NUM_CONDITION = 20
+SIGNAL_INPUT_RATIO_TAU = 1.5
+REMOVE_HORIZONTAL_STRIP_TAU = 0.01
+REMOVE_VERTICAL_STRIP_TAU = 0.01
+MIN_NUMBER_OF_POINTS_FOR_MIXTURE_OF_GAUSSIAN = 30
+####################################
+
+def get_three_components_mixtools(y, x, cond_lst):
+
+    # Remove NaNs or Infs
+    warn_msg = ''
+    sz = len(x)
+    if sz < MIN_NUMBER_OF_POINTS_FOR_MIXTURE_OF_GAUSSIAN: # too few points, ignore.
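+        # Too few finite points to fit a three-component mixture; return nine None values plus the
+        # message 'IGNORE' so that the caller skips this target-TF pair.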
+ return None, None, None, None, None, None, None, None, None, 'IGNORE' + + index = np.isfinite(x) & np.isfinite(y) + if sum(index) < sz: + warn_msg = 'HAS_NAN_OR_INIFNITE' + if sum(index) < MIN_NUMBER_OF_POINTS_FOR_MIXTURE_OF_GAUSSIAN: + return None, None, None, None, None, None, None, None, None, 'IGNORE' + + xx = np.array(x[index]) + yy = np.array(y[index]) + cond_lst2 = np.array(cond_lst) + cond_lst2 = cond_lst2[index] + + # Train the model + mixtools = importr('mixtools') + try: + result = mixtools.regmixEM(FloatVector(yy), FloatVector(xx), epsilon = 1e-04, k=3, maxit=100) + except: + return None, None, None, None, None, None, None, None, None, 'IGNORE' + posterior = result[result.names.index('posterior')] + posterior = np.array(posterior) + l = np.argmax(posterior, axis=1) # class information + idx1 = l == 0 + idx2 = l == 1 + idx3 = l == 2 + warn_msg = 'loglik=%4.2f' % (np.array(result[result.names.index('loglik')])[0]) + + return xx[idx1], yy[idx1], xx[idx2], yy[idx2], xx[idx3], yy[idx3], list(cond_lst2[idx1]), list(cond_lst2[idx2]), list(cond_lst2[idx3]), warn_msg + + +def read_experiment_id(fname): + ''' Read column names (RNA-seq ids) and row names (gene ids) from TPM.txt ''' + + colid = [] + rowid = [] + + f = open(fname) + lines = f.readlines() + f.close() + + head_line = lines[0].strip() + lst = head_line.split() + colid = lst[1:] + rowid = [] + + for line in lines[1:]: + line = line.strip() + lst = line.split() + g = lst[0] + rowid.append(g) + + return (colid, rowid) + + +def read_matrix_data(fname): + ''' + fname - a file, first line is head, first column is row name. + ''' + + lineno = 0 + colid = [] + rowid = [] + d = {} # {gene1:{cond1:val1, cond2:val2, ...}, gene2: {...}, ...} + d2 = {} # {cond1:{gene1:val1, gene2:val2, ...}, cond2: {...}, ...} + d3 = {} # {gene1: [], gene2: [], ...} + d4 = {} # {cond1:[], cond2:[], ...} + + f = open(fname) + lines = f.readlines() + f.close() + + head_line = lines[0].strip() + lst = head_line.split() + colid = lst[1:] + + for c in colid: + d2[c] = {} + d4[c] = [] + + for line in lines[1:]: + line = line.strip() + lst = line.split() + g = lst[0] + rowid.append(g) + d[g] = {} + levels = lst[1:] + if len(levels) != len(colid): + print('Incomplete columns at row %s' % (g)) + sys.exit() + + d3[g] = [] + for i in range(len(colid)): + c = colid[i] + d[g][c] = float(levels[i]) + d2[c][g] = float(levels[i]) + d3[g].append(float(levels[i])) + d4[c].append(float(levels[i])) + lineno += 1 + + d_return = {} + d_return['xy'] = d # first gene, then condition + d_return['yx'] = d2 # first condition, then gene + d_return['xx'] = d3 # each item is an array of gene expression levels, i.e., each item is a row + d_return['yy'] = d4 # each item is an array of gene expression levels, i.e., each item is a column + d_return['nrow'] = lineno - 1 + d_return['ncol'] = len(colid) + d_return['rowid'] = rowid + d_return['colid'] = colid + + d4_sorted = {} + for k in d4: + d4_sorted[k] = sorted(d4[k], reverse=True) + d_return['yy_sorted'] = d4_sorted + + return d_return + + +def get_value(s, delimit): + lst = s.split(delimit) + return lst[1].strip() + + +def read_info_data(fname): + ''' Read chip-seq data information ''' + + if not os.path.exists(fname): + print('%s not exists.' 
% (fname) ) + sys.exit() + + d = {'ID_LIST':[]} + f = open(fname) + lines = f.readlines() + f.close() + for line in lines: + line = line.strip() + if line == '' or line.startswith('#') or line.startswith('%'): + continue + if line.startswith(DATA_SYMBOL): + s = line[line.rfind(DATA_SYMBOL[-1])+1:] + s = s.strip() + if s in d: + print('ID %s duplicate' % (s)) + sys.exit() + d[s] = {'PROTEIN_ID':'', 'PROTEN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''} + d['ID_LIST'].append(s) + if line.startswith('DESCRIPTION:'): + d[s]['DESCRIPTION'] = get_value(line, ':') + elif line.startswith('PROTEN_NAME:'): + d[s]['PROTEN_NAME'] = get_value(line, ':') + elif line.startswith('PROTEIN_ID:'): + d[s]['PROTEIN_ID'] = get_value(line, ':') + elif line.startswith('DATA_NAME:'): + d[s]['DATA_NAME'] = get_value(line, ':') + elif line.startswith('DATA_FORMAT:'): + d[s]['DATA_FORMAT'] = get_value(line, ':') + elif line.startswith('LOCATION:'): + d[s]['LOCATION'] = get_value(line, ':') + elif line.startswith('NOTE:'): + d[s]['NOTE'] = get_value(line, ':') + + return d + + +def get_gene_list(fname): + result = [] + f = open(fname) + for line in f: + line = line.strip() + lst = line.split() + result.append(lst[0]) + f.close() + return result + + +def get_related_condition(s, info_dict): + lst = s.split(';') + result = [] # a list of sample IDs + result = info_dict['ID_LIST'] # TBD + return result + +def update_global_param_dict(glb_param_dict, info_dict): + if glb_param_dict['RESEARCH_KEYWORDS'] == '': + glb_param_dict['USER_CONDITION_LIST'] = info_dict['ID_LIST'] + glb_param_dict['USER_CONDITION_LIST'] = get_related_condition(glb_param_dict['RESEARCH_KEYWORDS'], info_dict) + + +def float_equal(x, y): + return np.abs(x-y) < 0.001 + + +def get_gene_expression2(gene_id1, gene_id2, cond_lst, expr_dict, takelog=False): + ''' get gene expression for two genes. Conditions in which two genes have zero TPM values are ignored. ''' + num_cond = len(cond_lst) + elst1 = [None]*num_cond + elst2 = [None]*num_cond + clst = [None]*num_cond + d = expr_dict['xy'] + j = 0 + for i in range(num_cond): + c = cond_lst[i] + x = expr_dict['xy'][gene_id1][c] + y = expr_dict['xy'][gene_id2][c] + #print('DEBUG %s %s %g %g c=%s' % (gene_id1, gene_id2, x, y, c)) + #print('DEBUG at2g07745 at R0000SRR1802166XX %g' % (expr_dict['xy']['AT2G07754']['R0000SRR1802166XX'])) + if not float_equal(x,0.0) or not float_equal(y,0.0): # at least one is not zero + if takelog == True: # increase gene expression uniformly by 1 for taking logarithm + elst1[j] = np.log(x+1) + elst2[j] = np.log(y+1) + else: + elst1[j] = x + elst2[j] = y + clst[j] = c + j += 1 + return ( np.array(elst1[0:j]), np.array(elst2[0:j]), clst[0:j] ) + + +def get_gene_expression3(gene_id1, gene_id2, cond_lst, expr_dict, takelog=False): + ''' + get gene expression for two genes. Conditions in which two genes have zero TPM values are ignored. + In addition, vertical strip (as seen in their scatter plot) and horizontal strip are removed. 
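+    Here a "strip" is any condition in which either gene's (optionally log-transformed) value falls below
+    REMOVE_VERTICAL_STRIP_TAU or REMOVE_HORIZONTAL_STRIP_TAU (both 0.01 by default); such conditions are dropped.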
+ ''' + num_cond = len(cond_lst) + elst1 = [None]*num_cond + elst2 = [None]*num_cond + mark_cond = [True]*num_cond # indicate if a condition should be included + clst = [] + d = expr_dict['xy'] + + for i in range(num_cond): + c = cond_lst[i] + x = expr_dict['xy'][gene_id1][c] + y = expr_dict['xy'][gene_id2][c] + if not float_equal(x,0.0) or not float_equal(y,0.0): # at least one is not zero + if takelog == True: # increase gene expression uniformly by 1 for taking logarithm + elst1[i] = np.log(x+1) + elst2[i] = np.log(y+1) + else: + elst1[i] = x + elst2[i] = y + if elst1[i] < REMOVE_VERTICAL_STRIP_TAU or elst2[i] < REMOVE_HORIZONTAL_STRIP_TAU: # don't include this condition if its values are in the strip + mark_cond[i] = False + else: + clst.append(c) + else: + mark_cond[i] = False + + a = np.array(elst1) + a = a[mark_cond] + b = np.array(elst2) + b = b[mark_cond] + return (a.astype(np.float64), b.astype(np.float64), clst) + + +#################### select condition stuff ############### + +def common_elements(list1, list2): + return sorted(list(set(list1).intersection(list2))) + +def correlation_is_significant(r, p, glb_param_dict): + return (not np.isnan(r)) and np.abs(r) > float(glb_param_dict['TWO_WAY_CORRELATION_CUTOFF']) and p < float(glb_param_dict['TWO_WAY_CORRELATION_PVALUE_CUTOFF']) + +def subset_list(lst, bool_index): + if len(lst) != len(bool_index): + print('subset_list: list size not equal (%d %s)', len(lst), len(bool_index)) + sys.exit() + + n = len(lst) + result = [] + for i in range(n): + if bool_index[i] == True: + result.append(lst[i]) + return result + + +def write_dict(d_lst, glb_param_dict, fname): + ''' write a list of dictionary content to fname. Each dictionary contains a target-TF edge. ''' + agi2name_dict = glb_param_dict['name_conversion_dict'] + curr_date = datetime.now().strftime('%Y%m%d_%H%M%S') # add date to the end of each line, for future reference or filtering + fname = os.path.join(EDGE_DIR, fname) + f = open(fname, 'w') + for d in d_lst: + if d['type'] == 'two-way': + gene_id = d['target'] + head = '%s %s\t' % (gene_id, get_gene_name(gene_id, agi2name_dict)) + gene_id = d['TF'] + head += '%s %s\t' % (gene_id, get_gene_name(gene_id, agi2name_dict)) + d2 = d['significant'] + if 'all' in d2: + s = '%4.2f\t%s\t%s\t%s\t%s' % (d2['all']['score'], 'all', '.', ' '.join(d2['all']['chip_id']), '.') + s += '\t' + curr_date + f.write(head + s + '\n') + if 'mix' in d2: + n = len(d2['mix']['score']) + for i in range(n): + s = '%4.2f\t%s\t%s\t%s\t%s' % (d2['mix']['score'][i], 'mix', ' '.join(d2['mix']['signal_set'][i]), ' '.join(d2['mix']['chip_id']), d2['mix']['message'][i]) + s += '\t' + curr_date + f.write(head + s + '\n') + f.close() + + +def two_way(target, tf_dict, expr_dict, expr_info_dict, glb_param_dict): + ''' + + Check if target has relationship with each of TFs. + + tf_dict: a dictionary of TFs, {tf_name:ChIP_ID_LIST) + + Return a list of dictionaries. Each dictionary has the following format: + + 'type' :'two-way' + 'target':'' + 'TF' :'' + 'significant': { + 'all': {'signal_set':[], score=2, chip_id:''} + 'pos_direction':{'signal_set':[], score:.0, chip_id:''} + 'neg_direction':{'signal_set':[], score:.1, chip_id:''} + 'user_defined': {'signal_set':[], score:.3, chip_id:''} + 'mix': {'signal_set':[], score:[.7,-.5], chip_id:''} + } + + ''' + + result_dict_lst = [] # a list of dictionaries, one for each TF, each dict contains info for a Target-TF pair + + target_gene_id = target + all_cond_lst = expr_dict['colid'] # Use all RNA-seq samples. 
TBD, can be glb_param_dict['USER_CONDITION_LIST'] + logrithmize = glb_param_dict['LOGRITHMIZE'].upper() == 'YES' # take logarithmic of TPM values + + if not target_gene_id in expr_dict['rowid']: # target gene not in expression table, cannot do anything + return None + + for tf_gene_id in sorted(tf_dict.keys()): # y is a TF gene id + + chip_id = tf_dict[tf_gene_id] # a list of chip experiment IDs, e.g., C00000000000 + + if not tf_gene_id in expr_dict['rowid']: # tf gene not in expression table, cannot do anything + continue + + # get gene expression profiles for target and TF. If in a RNA-seq sample, both target and TF is 0, then this sample is ignored. + target_elst, tf_elst, clist = get_gene_expression3(target_gene_id, tf_gene_id, all_cond_lst, expr_dict, takelog=logrithmize) + + r, p = stat.pearsonr(target_elst, tf_elst) + + d = {} + d['target'] = target_gene_id + d['TF'] = tf_gene_id + d['type'] = 'two-way' + d['significant'] = {} + + all_good = False + if correlation_is_significant(r, p, glb_param_dict): + d['significant']['all'] = {} + d['significant']['all']['signal_set'] = clist # a list of sample IDs, returned by get_gene_expression3 + d['significant']['all']['score'] = r + d['significant']['all']['chip_id'] = chip_id + all_good = True + + user_cond_lst = glb_param_dict['USER_CONDITION_LIST'] + if glb_param_dict['RESEARCH_KEYWORDS'] != '' and user_cond_lst != []: + target_elst_user, tf_elst_user, clist_user = get_gene_expression3(target_gene_id, tf_gene_id, user_cond_lst, expr_dict, takelog=logrithmize) + + r, p = stat.pearsonr(target_elst_user, tf_elst_user) + if correlation_is_significant(r, p, glb_param_dict): + d['significant']['user'] = {} + d['significant']['user']['signal_set'] = user_cond_lst + d['significant']['user']['score'] = r + d['significant']['user']['chip_id'] = chip_id + + # obsolete + max_diff = glb_param_dict['SELECT_POINTS_DIAGONAL_MAX_DIFF'] + if glb_param_dict['LOOK_FOR_POS_CORRELATION'] == 'YES': + aa, bb, index_pos = select_points_diagonal(target_elst, tf_elst, max_diff, 'pos') + r_pos, p_pos = stat.pearsonr(aa, bb) + if correlation_is_significant(r_pos, p_pos, glb_param_dict) and sum(index_pos) >= MIN_NUM_CONDITION: + d['significant']['pos'] = {} + d['significant']['pos']['signal_set'] = subset_list(all_cond_lst, index_pos) + d['significant']['pos']['score'] = r_pos + d['significant']['pos']['chip_id'] = chip_id + + # obsolete + if glb_param_dict['LOOK_FOR_NEG_CORRELATION'] == 'YES': + aa, bb, index_neg = select_points_diagonal(target_elst, tf_elst, max_diff, 'neg') + r_neg, p_neg = stat.pearsonr(aa, bb) + if correlation_is_significant(r_neg, p_neg, glb_param_dict) and sum(index_neg) >= MIN_NUM_CONDITION: + d['significant']['neg'] = {} + d['significant']['neg']['signal_set'] = subset_list(all_cond_lst, index_neg) + d['significant']['neg']['score'] = r_neg + d['significant']['neg']['chip_id'] = chip_id + + K = int(glb_param_dict['NUMBER_OF_COMPONENTS']) + if glb_param_dict['MIXTURE_OF_REGRESSION'] == 'YES' and not all_good: # look hard only when using all RNA-seq data does not produce good results + if K == 2: # for now consider two components + #print('DEBUG len1=%d, len=%d' % (len(target_elst), len(tf_elst))) + #print('DEBUG %s, %s, %s' % (target_gene_id, tf_gene_id, ' '.join(clist))) + index1, index2, msg = get_two_components(target_elst, tf_elst) # get two Gaussian Mixture Model components + if msg != 'IGNORE': + aa = target_elst[index1] + bb = tf_elst[index1] + r_mix1, p_mix1 = stat.pearsonr(aa, bb) + aa = target_elst[index2] + bb = tf_elst[index2] + 
r_mix2, p_mix2 = stat.pearsonr(aa, bb) + #print('DEBUG %s %s r_mix1:%g r_mix2:%g' % (target_gene_id, tf_gene_id, r_mix1, r_mix2)) + flag1 = correlation_is_significant(r_mix1, p_mix1, glb_param_dict) + flag2 = correlation_is_significant(r_mix2, p_mix2, glb_param_dict) + if flag1 or flag2: + d['significant']['mix'] = {} + d['significant']['mix']['signal_set'] = [] + d['significant']['mix']['score'] = [] + d['significant']['mix']['chip_id'] = chip_id + if flag1: + d['significant']['mix']['signal_set'].append(subset_list(clist, index1)) + d['significant']['mix']['score'].append(r_mix1) + if flag2: + d['significant']['mix']['signal_set'].append(subset_list(clist, index2)) + d['significant']['mix']['score'].append(r_mix2) + + if K == 3: # three components + aa1, bb1, aa2, bb2, aa3, bb3, cond1, cond2, cond3, msg = get_three_components_mixtools(target_elst, tf_elst, clist) # get two Gaussian Mixture Model components + if msg != 'IGNORE': + r_mix1, p_mix1 = stat.pearsonr(aa1, bb1) + r_mix2, p_mix2 = stat.pearsonr(aa2, bb2) + r_mix3, p_mix3 = stat.pearsonr(aa3, bb3) + #print('DEBUG %s, %s' % (target_gene_id, tf_gene_id)) + #print('DEBUG rmix1=%g, pmix1=%g' % (r_mix1, p_mix1)) + #print('DEBUG rmix2=%g, pmix2=%g' % (r_mix2, p_mix2)) + #print('DEBUG rmix3=%g, pmix3=%g' % (r_mix3, p_mix3)) + #print('DEBUG %d %d %d' %(len(aa1), len(aa2), len(aa3))) + min_num_points = int(glb_param_dict['CORRELATION_BASED_ON_AT_LEAST_N_POINTS']) + flag1 = correlation_is_significant(r_mix1, p_mix1, glb_param_dict) and len(aa1) > min_num_points + flag2 = correlation_is_significant(r_mix2, p_mix2, glb_param_dict) and len(aa2) > min_num_points + flag3 = correlation_is_significant(r_mix3, p_mix3, glb_param_dict) and len(aa3) > min_num_points + if flag1 or flag2 or flag3: + d['significant']['mix'] = {} + d['significant']['mix']['signal_set'] = [] + d['significant']['mix']['score'] = [] + d['significant']['mix']['chip_id'] = chip_id + d['significant']['mix']['message'] = [] + if flag1: + d['significant']['mix']['signal_set'].append(cond1) + d['significant']['mix']['score'].append(r_mix1) + d['significant']['mix']['message'].append(msg) + if flag2: + d['significant']['mix']['signal_set'].append(cond2) + d['significant']['mix']['score'].append(r_mix2) + d['significant']['mix']['message'].append(msg) + if flag3: + d['significant']['mix']['signal_set'].append(cond3) + d['significant']['mix']['score'].append(r_mix3) + d['significant']['mix']['message'].append(msg) + + if len(d['significant']) > 0: # significant edges exist + result_dict_lst.append(d) + + + curr_time = datetime.now().strftime('%Y%m%d') + fname = 'edges.txt' + '.' + curr_time + '.' 
+ target_gene_id + if result_dict_lst != []: + write_dict(result_dict_lst, glb_param_dict, fname) + + +def three_way(target, tf_lst, expr_dict, expr_info_dict, glb_param_dict): + ''' TBD ''' + return [] + + +def make_small_expr_dict(lst, colid, rowid, dirname): + d_return = {} + d = {} + for g in lst: + fname = os.path.join(dirname, g + '.json') + if os.path.exists(fname): + with open(fname) as f: + d[g] = json.load(f) + + d_return['colid'] = colid + d_return['rowid'] = rowid + d_return['xy'] = d + + return d_return + +def make_tf_dict(fname): + d = {} + f = open(fname) + for line in f: + line = line.strip() + lst = line.split('\t') + target = lst[0] + tf = lst[1] + cond = lst[2].split() + if not target in d: + d[target] = {tf:cond} + else: + d[target][tf] = cond + f.close() + return d + + +def establish_edges(jsonTPM_dir, expr_info_dict, rna_experiment_ids, rna_gene_ids, d, bind_info_dict, glb_param_dict): + ''' d - binding dictionary {target:{tf:[c1,c2], tf:[c2,c3]}, ... } ''' + + gene_lst = get_gene_list(glb_param_dict['GENE_LIST']) + high_gene_lst = glb_param_dict['HIGH_PRIORITY_GENE'].split() # high priority genes + update_global_param_dict(glb_param_dict, expr_info_dict) + + if not os.path.isdir(EDGE_DIR): + os.makedirs(EDGE_DIR) + + process_lst = [] # a list of processes + final_gene_lst = high_gene_lst + for x in gene_lst: + if not x in high_gene_lst: + final_gene_lst.append(x) + + for g in final_gene_lst: # high priority genes are in the front + if g in d: # target g is in binding dictionary d + tf_dict = d[g] + if len(tf_dict) > 0: # it has TFs, usually it is the case + if glb_param_dict['TWO_WAY'] == 'YES': + small_gene_lst = tf_dict.keys() + small_gene_lst.append(g) + expr_dict = make_small_expr_dict(small_gene_lst, rna_experiment_ids, rna_gene_ids, jsonTPM_dir) + p = multiprocessing.Process(target=two_way, args=(g, tf_dict, expr_dict, expr_info_dict, glb_param_dict)) + process_lst.append(p) + p.start() + if len(process_lst) > MAX_NUM_PROCESS: + temp_lst = [] + for p in process_lst: + p.join(timeout=60) # wait for subprocess to finish, wait for 60 secs, if it has not finished yet, check next subprocess + if p.is_alive(): + temp_lst.append(p) + process_lst = temp_lst + + +########## main ################################################## +r.r['options'](warn=-1) # supress warning message from rpy2 +warnings.filterwarnings("ignore") + +param_file = sys.argv[1] # a single prameter file +glb_param_dict = make_global_param_dict(param_file) +agi2name_dict = make_gene_name_AGI_map_dict(GENE_ID_TO_GENE_NAME) +glb_param_dict['name_conversion_dict'] = agi2name_dict + +#print('Read expression data') +cmd = 'python TPM2JSON.py %s' % (param_file) # make jsonTPM directory +os.system(cmd) +curr_time = datetime.now().strftime('%Y%m%d_%H%M') +JSON_DIR = '../Data/history/expr/jsonTPM_%s' % (curr_time) +cmd = 'mv ../Data/history/expr/jsonTPM %s' % (JSON_DIR) +os.system(cmd) + +expr_info_dict = read_info_data(glb_param_dict['EXPRESSION_INFO']) +colid, rowid = read_experiment_id(glb_param_dict['EXPRESSION_MATRIX']) + +#print('Read binding data') +cmd = 'python make_target_tf.py %s' % (param_file) # make target_tf.txt +os.system(cmd) # change +curr_time = datetime.now().strftime('%Y%m%d_%H%M') +target_tf_fname = 'target_tf.txt.' 
+ curr_time +cmd = 'cp target_tf.txt %s' % (target_tf_fname) +os.system(cmd) +big_tf_dict = make_tf_dict(target_tf_fname) +bind_info_dict = read_info_data(glb_param_dict['BINDING_INFO']) + +#print('Establish edges') +establish_edges(JSON_DIR, expr_info_dict, colid, rowid, big_tf_dict, bind_info_dict, glb_param_dict) -- cgit v1.2.1
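Note on the intermediate files this script consumes (a sketch, not part of the patch above): make_target_tf.py is expected to produce target_tf.txt with one tab-separated line per target-TF pair (target gene ID, TF gene ID, then a space-separated list of ChIP experiment IDs), and TPM2JSON.py is expected to produce a jsonTPM directory holding one <gene>.json file per gene that maps RNA-seq sample IDs to TPM values; these are the formats assumed by make_tf_dict() and make_small_expr_dict() above. The toy builder below only illustrates that layout; all gene, sample, and ChIP IDs are invented, and the authoritative formats live in make_target_tf.py and TPM2JSON.py, which are not included in this commit.

import json, os

# target_tf.txt: one line per target-TF pair, three tab-separated fields;
# the third field is a space-separated list of ChIP experiment IDs (toy IDs here).
with open('target_tf.txt', 'w') as f:
    f.write('AT1G01010\tAT5G65310\tC0001000011 C0001000012\n')

# jsonTPM directory: one JSON file per gene, {RNA-seq sample ID: TPM value}.
# With real data, every sample ID from the TPM.txt header should appear in each gene's JSON.
if not os.path.isdir('jsonTPM'):
    os.makedirs('jsonTPM')
for gene in ['AT1G01010', 'AT5G65310']:
    with open(os.path.join('jsonTPM', gene + '.json'), 'w') as f:
        json.dump({'R0000SRR1802166XX': 3.1, 'R0001SRR1802167XX': 0.0}, f)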