# Usage: python create_edges3.py parameter_for_net.txt # # Make it faster by spawning subprocesses. # # Make it smaller by spliting TPM.txt into small json files, and converting binding.txt to target_tf.txt first. # So we don't need to load the big matrices, TPM.txt and binding.txt. # # 19 JAN 2017, hui, slcu import sys, os, operator, itertools import numpy as np import scipy.stats as stat from datetime import datetime import rpy2.robjects as r from rpy2.robjects.packages import importr from rpy2.robjects import FloatVector import warnings import multiprocessing import time import json from geneid2name import make_gene_name_AGI_map_dict from param4net import make_global_param_dict, get_gene_name EDGE_DIR = '../Data/history/edges/one_target' # a directory storing all edge files, one for each target gene GENE_ID_TO_GENE_NAME = '../Data/information/AGI-to-gene-names_v2.txt' MAX_NUM_PROCESS = 20 #################################### DATA_SYMBOL = '@' TOP_N_TF = 50 MIN_NUM_CONDITION = 20 SIGNAL_INPUT_RATIO_TAU = 1.5 REMOVE_HORIZONTAL_STRIP_TAU = 0.01 REMOVE_VERTICAL_STRIP_TAU = 0.01 MIN_NUMBER_OF_POINTS_FOR_MIXTURE_OF_GAUSSIAN = 30 #################################### def get_three_components_mixtools(y, x, cond_lst): # Remove NaNs or Infs warn_msg = '' sz = len(x) if sz < MIN_NUMBER_OF_POINTS_FOR_MIXTURE_OF_GAUSSIAN: # too few points, ignore. return None, None, None, None, None, None, None, None, None, 'IGNORE' index = np.isfinite(x) & np.isfinite(y) if sum(index) < sz: warn_msg = 'HAS_NAN_OR_INIFNITE' if sum(index) < MIN_NUMBER_OF_POINTS_FOR_MIXTURE_OF_GAUSSIAN: return None, None, None, None, None, None, None, None, None, 'IGNORE' xx = np.array(x[index]) yy = np.array(y[index]) cond_lst2 = np.array(cond_lst) cond_lst2 = cond_lst2[index] # Train the model mixtools = importr('mixtools') try: result = mixtools.regmixEM(FloatVector(yy), FloatVector(xx), epsilon = 1e-04, k=3, maxit=100) except: return None, None, None, None, None, None, None, None, None, 'IGNORE' posterior = result[result.names.index('posterior')] posterior = np.array(posterior) l = np.argmax(posterior, axis=1) # class information idx1 = l == 0 idx2 = l == 1 idx3 = l == 2 warn_msg = 'loglik=%4.2f' % (np.array(result[result.names.index('loglik')])[0]) return xx[idx1], yy[idx1], xx[idx2], yy[idx2], xx[idx3], yy[idx3], list(cond_lst2[idx1]), list(cond_lst2[idx2]), list(cond_lst2[idx3]), warn_msg def read_experiment_id(fname): ''' Read column names (RNA-seq ids) and row names (gene ids) from TPM.txt ''' colid = [] rowid = [] f = open(fname) lines = f.readlines() f.close() head_line = lines[0].strip() lst = head_line.split() colid = lst[1:] rowid = [] for line in lines[1:]: line = line.strip() lst = line.split() g = lst[0] rowid.append(g) return (colid, rowid) def read_matrix_data(fname): ''' fname - a file, first line is head, first column is row name. ''' lineno = 0 colid = [] rowid = [] d = {} # {gene1:{cond1:val1, cond2:val2, ...}, gene2: {...}, ...} d2 = {} # {cond1:{gene1:val1, gene2:val2, ...}, cond2: {...}, ...} d3 = {} # {gene1: [], gene2: [], ...} d4 = {} # {cond1:[], cond2:[], ...} f = open(fname) lines = f.readlines() f.close() head_line = lines[0].strip() lst = head_line.split() colid = lst[1:] for c in colid: d2[c] = {} d4[c] = [] for line in lines[1:]: line = line.strip() lst = line.split() g = lst[0] rowid.append(g) d[g] = {} levels = lst[1:] if len(levels) != len(colid): print('Incomplete columns at row %s' % (g)) sys.exit() d3[g] = [] for i in range(len(colid)): c = colid[i] d[g][c] = float(levels[i]) d2[c][g] = float(levels[i]) d3[g].append(float(levels[i])) d4[c].append(float(levels[i])) lineno += 1 d_return = {} d_return['xy'] = d # first gene, then condition d_return['yx'] = d2 # first condition, then gene d_return['xx'] = d3 # each item is an array of gene expression levels, i.e., each item is a row d_return['yy'] = d4 # each item is an array of gene expression levels, i.e., each item is a column d_return['nrow'] = lineno - 1 d_return['ncol'] = len(colid) d_return['rowid'] = rowid d_return['colid'] = colid d4_sorted = {} for k in d4: d4_sorted[k] = sorted(d4[k], reverse=True) d_return['yy_sorted'] = d4_sorted return d_return def get_value(s, delimit): lst = s.split(delimit) return lst[1].strip() def read_info_data(fname): ''' Read chip-seq data information ''' if not os.path.exists(fname): print('%s not exists.' % (fname) ) sys.exit() d = {'ID_LIST':[]} f = open(fname) lines = f.readlines() f.close() for line in lines: line = line.strip() if line == '' or line.startswith('#') or line.startswith('%'): continue if line.startswith(DATA_SYMBOL): s = line[line.rfind(DATA_SYMBOL[-1])+1:] s = s.strip() if s in d: print('ID %s duplicate' % (s)) sys.exit() d[s] = {'PROTEIN_ID':'', 'PROTEN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''} d['ID_LIST'].append(s) if line.startswith('DESCRIPTION:'): d[s]['DESCRIPTION'] = get_value(line, ':') elif line.startswith('PROTEN_NAME:'): d[s]['PROTEN_NAME'] = get_value(line, ':') elif line.startswith('PROTEIN_ID:'): d[s]['PROTEIN_ID'] = get_value(line, ':') elif line.startswith('DATA_NAME:'): d[s]['DATA_NAME'] = get_value(line, ':') elif line.startswith('DATA_FORMAT:'): d[s]['DATA_FORMAT'] = get_value(line, ':') elif line.startswith('LOCATION:'): d[s]['LOCATION'] = get_value(line, ':') elif line.startswith('NOTE:'): d[s]['NOTE'] = get_value(line, ':') return d def get_gene_list(fname): result = [] f = open(fname) for line in f: line = line.strip() lst = line.split() result.append(lst[0]) f.close() return result def get_related_condition(s, info_dict): lst = s.split(';') result = [] # a list of sample IDs result = info_dict['ID_LIST'] # TBD return result def update_global_param_dict(glb_param_dict, info_dict): if glb_param_dict['RESEARCH_KEYWORDS'] == '': glb_param_dict['USER_CONDITION_LIST'] = info_dict['ID_LIST'] glb_param_dict['USER_CONDITION_LIST'] = get_related_condition(glb_param_dict['RESEARCH_KEYWORDS'], info_dict) def float_equal(x, y): return np.abs(x-y) < 0.001 def get_gene_expression2(gene_id1, gene_id2, cond_lst, expr_dict, takelog=False): ''' get gene expression for two genes. Conditions in which two genes have zero TPM values are ignored. ''' num_cond = len(cond_lst) elst1 = [None]*num_cond elst2 = [None]*num_cond clst = [None]*num_cond d = expr_dict['xy'] j = 0 for i in range(num_cond): c = cond_lst[i] x = expr_dict['xy'][gene_id1][c] y = expr_dict['xy'][gene_id2][c] #print('DEBUG %s %s %g %g c=%s' % (gene_id1, gene_id2, x, y, c)) #print('DEBUG at2g07745 at R0000SRR1802166XX %g' % (expr_dict['xy']['AT2G07754']['R0000SRR1802166XX'])) if not float_equal(x,0.0) or not float_equal(y,0.0): # at least one is not zero if takelog == True: # increase gene expression uniformly by 1 for taking logarithm elst1[j] = np.log(x+1) elst2[j] = np.log(y+1) else: elst1[j] = x elst2[j] = y clst[j] = c j += 1 return ( np.array(elst1[0:j]), np.array(elst2[0:j]), clst[0:j] ) def get_gene_expression3(gene_id1, gene_id2, cond_lst, expr_dict, takelog=False): ''' get gene expression for two genes. Conditions in which two genes have zero TPM values are ignored. In addition, vertical strip (as seen in their scatter plot) and horizontal strip are removed. ''' num_cond = len(cond_lst) elst1 = [None]*num_cond elst2 = [None]*num_cond mark_cond = [True]*num_cond # indicate if a condition should be included clst = [] d = expr_dict['xy'] for i in range(num_cond): c = cond_lst[i] x = expr_dict['xy'][gene_id1][c] y = expr_dict['xy'][gene_id2][c] if not float_equal(x,0.0) or not float_equal(y,0.0): # at least one is not zero if takelog == True: # increase gene expression uniformly by 1 for taking logarithm elst1[i] = np.log(x+1) elst2[i] = np.log(y+1) else: elst1[i] = x elst2[i] = y if elst1[i] < REMOVE_VERTICAL_STRIP_TAU or elst2[i] < REMOVE_HORIZONTAL_STRIP_TAU: # don't include this condition if its values are in the strip mark_cond[i] = False else: clst.append(c) else: mark_cond[i] = False a = np.array(elst1) a = a[mark_cond] b = np.array(elst2) b = b[mark_cond] return (a.astype(np.float64), b.astype(np.float64), clst) #################### select condition stuff ############### def common_elements(list1, list2): return sorted(list(set(list1).intersection(list2))) def correlation_is_significant(r, p, glb_param_dict): return (not np.isnan(r)) and np.abs(r) > float(glb_param_dict['TWO_WAY_CORRELATION_CUTOFF']) and p < float(glb_param_dict['TWO_WAY_CORRELATION_PVALUE_CUTOFF']) def subset_list(lst, bool_index): if len(lst) != len(bool_index): print('subset_list: list size not equal (%d %s)', len(lst), len(bool_index)) sys.exit() n = len(lst) result = [] for i in range(n): if bool_index[i] == True: result.append(lst[i]) return result def write_dict(d_lst, glb_param_dict, fname): ''' write a list of dictionary content to fname. Each dictionary contains a target-TF edge. ''' agi2name_dict = glb_param_dict['name_conversion_dict'] curr_date = datetime.now().strftime('%Y%m%d_%H%M%S') # add date to the end of each line, for future reference or filtering fname = os.path.join(EDGE_DIR, fname) f = open(fname, 'w') for d in d_lst: if d['type'] == 'two-way': gene_id = d['target'] head = '%s %s\t' % (gene_id, get_gene_name(gene_id, agi2name_dict)) gene_id = d['TF'] head += '%s %s\t' % (gene_id, get_gene_name(gene_id, agi2name_dict)) d2 = d['significant'] if 'all' in d2: s = '%4.2f\t%s\t%s\t%s\t%s' % (d2['all']['score'], 'all', '.', ' '.join(d2['all']['chip_id']), '.') s += '\t' + curr_date f.write(head + s + '\n') if 'mix' in d2: n = len(d2['mix']['score']) for i in range(n): s = '%4.2f\t%s\t%s\t%s\t%s' % (d2['mix']['score'][i], 'mix', ' '.join(d2['mix']['signal_set'][i]), ' '.join(d2['mix']['chip_id']), d2['mix']['message'][i]) s += '\t' + curr_date f.write(head + s + '\n') f.close() def two_way(target, tf_dict, expr_dict, expr_info_dict, glb_param_dict): ''' Check if target has relationship with each of TFs. tf_dict: a dictionary of TFs, {tf_name:ChIP_ID_LIST) Return a list of dictionaries. Each dictionary has the following format: 'type' :'two-way' 'target':'' 'TF' :'' 'significant': { 'all': {'signal_set':[], score=2, chip_id:''} 'pos_direction':{'signal_set':[], score:.0, chip_id:''} 'neg_direction':{'signal_set':[], score:.1, chip_id:''} 'user_defined': {'signal_set':[], score:.3, chip_id:''} 'mix': {'signal_set':[], score:[.7,-.5], chip_id:''} } ''' result_dict_lst = [] # a list of dictionaries, one for each TF, each dict contains info for a Target-TF pair target_gene_id = target all_cond_lst = expr_dict['colid'] # Use all RNA-seq samples. TBD, can be glb_param_dict['USER_CONDITION_LIST'] logrithmize = glb_param_dict['LOGRITHMIZE'].upper() == 'YES' # take logarithmic of TPM values if not target_gene_id in expr_dict['rowid']: # target gene not in expression table, cannot do anything return None for tf_gene_id in sorted(tf_dict.keys()): # y is a TF gene id chip_id = tf_dict[tf_gene_id] # a list of chip experiment IDs, e.g., C00000000000 if not tf_gene_id in expr_dict['rowid']: # tf gene not in expression table, cannot do anything continue # get gene expression profiles for target and TF. If in a RNA-seq sample, both target and TF is 0, then this sample is ignored. target_elst, tf_elst, clist = get_gene_expression3(target_gene_id, tf_gene_id, all_cond_lst, expr_dict, takelog=logrithmize) r, p = stat.pearsonr(target_elst, tf_elst) d = {} d['target'] = target_gene_id d['TF'] = tf_gene_id d['type'] = 'two-way' d['significant'] = {} all_good = False if correlation_is_significant(r, p, glb_param_dict): d['significant']['all'] = {} d['significant']['all']['signal_set'] = clist # a list of sample IDs, returned by get_gene_expression3 d['significant']['all']['score'] = r d['significant']['all']['chip_id'] = chip_id all_good = True user_cond_lst = glb_param_dict['USER_CONDITION_LIST'] if glb_param_dict['RESEARCH_KEYWORDS'] != '' and user_cond_lst != []: target_elst_user, tf_elst_user, clist_user = get_gene_expression3(target_gene_id, tf_gene_id, user_cond_lst, expr_dict, takelog=logrithmize) r, p = stat.pearsonr(target_elst_user, tf_elst_user) if correlation_is_significant(r, p, glb_param_dict): d['significant']['user'] = {} d['significant']['user']['signal_set'] = user_cond_lst d['significant']['user']['score'] = r d['significant']['user']['chip_id'] = chip_id # obsolete max_diff = glb_param_dict['SELECT_POINTS_DIAGONAL_MAX_DIFF'] if glb_param_dict['LOOK_FOR_POS_CORRELATION'] == 'YES': aa, bb, index_pos = select_points_diagonal(target_elst, tf_elst, max_diff, 'pos') r_pos, p_pos = stat.pearsonr(aa, bb) if correlation_is_significant(r_pos, p_pos, glb_param_dict) and sum(index_pos) >= MIN_NUM_CONDITION: d['significant']['pos'] = {} d['significant']['pos']['signal_set'] = subset_list(all_cond_lst, index_pos) d['significant']['pos']['score'] = r_pos d['significant']['pos']['chip_id'] = chip_id # obsolete if glb_param_dict['LOOK_FOR_NEG_CORRELATION'] == 'YES': aa, bb, index_neg = select_points_diagonal(target_elst, tf_elst, max_diff, 'neg') r_neg, p_neg = stat.pearsonr(aa, bb) if correlation_is_significant(r_neg, p_neg, glb_param_dict) and sum(index_neg) >= MIN_NUM_CONDITION: d['significant']['neg'] = {} d['significant']['neg']['signal_set'] = subset_list(all_cond_lst, index_neg) d['significant']['neg']['score'] = r_neg d['significant']['neg']['chip_id'] = chip_id K = int(glb_param_dict['NUMBER_OF_COMPONENTS']) if glb_param_dict['MIXTURE_OF_REGRESSION'] == 'YES' and not all_good: # look hard only when using all RNA-seq data does not produce good results if K == 2: # for now consider two components #print('DEBUG len1=%d, len=%d' % (len(target_elst), len(tf_elst))) #print('DEBUG %s, %s, %s' % (target_gene_id, tf_gene_id, ' '.join(clist))) index1, index2, msg = get_two_components(target_elst, tf_elst) # get two Gaussian Mixture Model components if msg != 'IGNORE': aa = target_elst[index1] bb = tf_elst[index1] r_mix1, p_mix1 = stat.pearsonr(aa, bb) aa = target_elst[index2] bb = tf_elst[index2] r_mix2, p_mix2 = stat.pearsonr(aa, bb) #print('DEBUG %s %s r_mix1:%g r_mix2:%g' % (target_gene_id, tf_gene_id, r_mix1, r_mix2)) flag1 = correlation_is_significant(r_mix1, p_mix1, glb_param_dict) flag2 = correlation_is_significant(r_mix2, p_mix2, glb_param_dict) if flag1 or flag2: d['significant']['mix'] = {} d['significant']['mix']['signal_set'] = [] d['significant']['mix']['score'] = [] d['significant']['mix']['chip_id'] = chip_id if flag1: d['significant']['mix']['signal_set'].append(subset_list(clist, index1)) d['significant']['mix']['score'].append(r_mix1) if flag2: d['significant']['mix']['signal_set'].append(subset_list(clist, index2)) d['significant']['mix']['score'].append(r_mix2) if K == 3: # three components aa1, bb1, aa2, bb2, aa3, bb3, cond1, cond2, cond3, msg = get_three_components_mixtools(target_elst, tf_elst, clist) # get two Gaussian Mixture Model components if msg != 'IGNORE': r_mix1, p_mix1 = stat.pearsonr(aa1, bb1) r_mix2, p_mix2 = stat.pearsonr(aa2, bb2) r_mix3, p_mix3 = stat.pearsonr(aa3, bb3) #print('DEBUG %s, %s' % (target_gene_id, tf_gene_id)) #print('DEBUG rmix1=%g, pmix1=%g' % (r_mix1, p_mix1)) #print('DEBUG rmix2=%g, pmix2=%g' % (r_mix2, p_mix2)) #print('DEBUG rmix3=%g, pmix3=%g' % (r_mix3, p_mix3)) #print('DEBUG %d %d %d' %(len(aa1), len(aa2), len(aa3))) min_num_points = int(glb_param_dict['CORRELATION_BASED_ON_AT_LEAST_N_POINTS']) flag1 = correlation_is_significant(r_mix1, p_mix1, glb_param_dict) and len(aa1) > min_num_points flag2 = correlation_is_significant(r_mix2, p_mix2, glb_param_dict) and len(aa2) > min_num_points flag3 = correlation_is_significant(r_mix3, p_mix3, glb_param_dict) and len(aa3) > min_num_points if flag1 or flag2 or flag3: d['significant']['mix'] = {} d['significant']['mix']['signal_set'] = [] d['significant']['mix']['score'] = [] d['significant']['mix']['chip_id'] = chip_id d['significant']['mix']['message'] = [] if flag1: d['significant']['mix']['signal_set'].append(cond1) d['significant']['mix']['score'].append(r_mix1) d['significant']['mix']['message'].append(msg) if flag2: d['significant']['mix']['signal_set'].append(cond2) d['significant']['mix']['score'].append(r_mix2) d['significant']['mix']['message'].append(msg) if flag3: d['significant']['mix']['signal_set'].append(cond3) d['significant']['mix']['score'].append(r_mix3) d['significant']['mix']['message'].append(msg) if len(d['significant']) > 0: # significant edges exist result_dict_lst.append(d) curr_time = datetime.now().strftime('%Y%m%d') fname = 'edges.txt' + '.' + curr_time + '.' + target_gene_id if result_dict_lst != []: write_dict(result_dict_lst, glb_param_dict, fname) def three_way(target, tf_lst, expr_dict, expr_info_dict, glb_param_dict): ''' TBD ''' return [] def make_small_expr_dict(lst, colid, rowid, dirname): d_return = {} d = {} for g in lst: fname = os.path.join(dirname, g + '.json') if os.path.exists(fname): with open(fname) as f: d[g] = json.load(f) d_return['colid'] = colid d_return['rowid'] = rowid d_return['xy'] = d return d_return def make_tf_dict(fname): d = {} f = open(fname) for line in f: line = line.strip() lst = line.split('\t') target = lst[0] tf = lst[1] cond = lst[2].split() if not target in d: d[target] = {tf:cond} else: d[target][tf] = cond f.close() return d def establish_edges(jsonTPM_dir, expr_info_dict, rna_experiment_ids, rna_gene_ids, d, bind_info_dict, glb_param_dict): ''' d - binding dictionary {target:{tf:[c1,c2], tf:[c2,c3]}, ... } ''' gene_lst = get_gene_list(glb_param_dict['GENE_LIST']) high_gene_lst = glb_param_dict['HIGH_PRIORITY_GENE'].split() # high priority genes update_global_param_dict(glb_param_dict, expr_info_dict) if not os.path.isdir(EDGE_DIR): os.makedirs(EDGE_DIR) process_lst = [] # a list of processes final_gene_lst = high_gene_lst for x in gene_lst: if not x in high_gene_lst: final_gene_lst.append(x) for g in final_gene_lst: # high priority genes are in the front if g in d: # target g is in binding dictionary d tf_dict = d[g] if len(tf_dict) > 0: # it has TFs, usually it is the case if glb_param_dict['TWO_WAY'] == 'YES': small_gene_lst = tf_dict.keys() small_gene_lst.append(g) expr_dict = make_small_expr_dict(small_gene_lst, rna_experiment_ids, rna_gene_ids, jsonTPM_dir) p = multiprocessing.Process(target=two_way, args=(g, tf_dict, expr_dict, expr_info_dict, glb_param_dict)) process_lst.append(p) p.start() if len(process_lst) > MAX_NUM_PROCESS: temp_lst = [] for p in process_lst: p.join(timeout=60) # wait for subprocess to finish, wait for 60 secs, if it has not finished yet, check next subprocess if p.is_alive(): temp_lst.append(p) process_lst = temp_lst ########## main ################################################## r.r['options'](warn=-1) # supress warning message from rpy2 warnings.filterwarnings("ignore") param_file = sys.argv[1] # a single prameter file glb_param_dict = make_global_param_dict(param_file) agi2name_dict = make_gene_name_AGI_map_dict(GENE_ID_TO_GENE_NAME) glb_param_dict['name_conversion_dict'] = agi2name_dict #print('Read expression data') cmd = 'python TPM2JSON.py %s' % (param_file) # make jsonTPM directory os.system(cmd) curr_time = datetime.now().strftime('%Y%m%d_%H%M') JSON_DIR = '../Data/history/expr/jsonTPM_%s' % (curr_time) cmd = 'mv ../Data/history/expr/jsonTPM %s' % (JSON_DIR) os.system(cmd) expr_info_dict = read_info_data(glb_param_dict['EXPRESSION_INFO']) colid, rowid = read_experiment_id(glb_param_dict['EXPRESSION_MATRIX']) #print('Read binding data') cmd = 'python make_target_tf.py %s' % (param_file) # make target_tf.txt os.system(cmd) # change curr_time = datetime.now().strftime('%Y%m%d_%H%M') target_tf_fname = 'target_tf.txt.' + curr_time cmd = 'cp target_tf.txt %s' % (target_tf_fname) os.system(cmd) big_tf_dict = make_tf_dict(target_tf_fname) bind_info_dict = read_info_data(glb_param_dict['BINDING_INFO']) #print('Establish edges') establish_edges(JSON_DIR, expr_info_dict, colid, rowid, big_tf_dict, bind_info_dict, glb_param_dict)