summaryrefslogtreecommitdiff
path: root/Code/create_edges.py
diff options
context:
space:
mode:
Diffstat (limited to 'Code/create_edges.py')
-rw-r--r--Code/create_edges.py840
1 files changed, 840 insertions, 0 deletions
diff --git a/Code/create_edges.py b/Code/create_edges.py
new file mode 100644
index 0000000..bacf179
--- /dev/null
+++ b/Code/create_edges.py
@@ -0,0 +1,840 @@
+# Usage: python create_edges.py parameter_for_net.txt > edges.txt.20170227_1618
+#
+# 01 DEC 2016, hui
+
+import sys, os, operator, itertools
+import numpy as np
+import matplotlib.pyplot as plt
+import scipy.stats as stat
+from datetime import datetime
+
+import rpy2.robjects as r
+from rpy2.robjects.packages import importr
+from rpy2.robjects import FloatVector
+
+import warnings
+from geneid2name import make_gene_name_AGI_map_dict, get_gene_name
+from param4net import make_global_param_dict
+
+####### Utility files #############
+GENE_ID_TO_GENE_NAME = '../Data/information/AGI-to-gene-names_v2.txt'
+
+####################################
+GLB_PARAM_SYMBOL = '%%'
+DATA_SYMBOL = '@'
+TOP_N_TF = 50
+MIN_NUM_CONDITION = 20
+SIGNAL_INPUT_RATIO_TAU = 1.5
+REMOVE_HORIZONTAL_STRIP_TAU = 0.05
+REMOVE_VERTICAL_STRIP_TAU = 0.05
+MIN_NUMBER_OF_POINTS_FOR_MIXTURE_OF_GAUSSIAN = 30
+####################################
+
+def get_two_components(y, x):
+ K = 2
+ epsilon = 1e-4
+ lam = 0.1
+ iterations = 25
+ random_restarts = 2
+
+ # Remove NaNs or Infs
+ warn_msg = ''
+ sz = len(x)
+ if sz < MIN_NUMBER_OF_POINTS_FOR_MIXTURE_OF_GAUSSIAN: # too few points, ignore.
+ return None, None, 'IGNORE'
+ # print('DEBUG')
+ # print(y)
+ # print(x)
+ # print(type(y))
+ # print(type(x))
+ index = np.isfinite(x) & np.isfinite(y)
+ if sum(index) < sz:
+ warn_msg = np.array_str(x) + ',' + np.array_str(y)
+ if sum(index) < MIN_NUMBER_OF_POINTS_FOR_MIXTURE_OF_GAUSSIAN:
+ return None, None, 'IGNORE'
+
+ x = x[index]
+ y = y[index]
+
+ # Train the model
+ model = LinearRegressionsMixture(np.expand_dims(x, axis=1), np.expand_dims(y, axis=1), K=K)
+ model.train(epsilon=epsilon, lam=lam, iterations=iterations, random_restarts=random_restarts, verbose=False)
+ idx1 = (model.gamma[:,0] > model.gamma[:,1]) # model.gamma is a vector of posterior probabilities
+ idx2 = (model.gamma[:,1] > model.gamma[:,0])
+ return idx1, idx2, warn_msg
+
+
+def get_three_components(y, x, cond_lst):
+ K = 3
+ epsilon = 1e-4
+ lam = 0.1
+ iterations = 50
+ random_restarts = 5
+
+ # Remove NaNs or Infs
+ warn_msg = ''
+ sz = len(x)
+ if sz < MIN_NUMBER_OF_POINTS_FOR_MIXTURE_OF_GAUSSIAN: # too few points, ignore.
+ return None, None, None, None, None, None, None, None, None, 'IGNORE'
+
+ index = np.isfinite(x) & np.isfinite(y)
+ if sum(index) < sz:
+ warn_msg = 'HAS_NAN_OR_INIFNITE'
+ if sum(index) < MIN_NUMBER_OF_POINTS_FOR_MIXTURE_OF_GAUSSIAN:
+ return None, None, None, None, None, None, None, None, None, 'IGNORE'
+
+ xx = np.array(x[index])
+ yy = np.array(y[index])
+ cond_lst2 = np.array(cond_lst)
+ cond_lst2 = cond_lst2[index]
+
+
+ # Train the model
+ model = LinearRegressionsMixture(np.expand_dims(xx, axis=1), np.expand_dims(yy, axis=1), K=K)
+ model.train(epsilon=epsilon, lam=lam, iterations=iterations, random_restarts=random_restarts, verbose=False)
+ idx1 = np.array(model.gamma[:,0] > model.gamma[:,1]) & np.array(model.gamma[:,0] > model.gamma[:,2]) # model.gamma is a vector of posterior probabilities
+ idx2 = np.array(model.gamma[:,1] > model.gamma[:,0]) & np.array(model.gamma[:,1] > model.gamma[:,2])
+ idx3 = np.array(model.gamma[:,2] > model.gamma[:,0]) & np.array(model.gamma[:,2] > model.gamma[:,1])
+ return xx[idx1], yy[idx1], xx[idx2], yy[idx2], xx[idx3], yy[idx3], list(cond_lst2[idx1]), list(cond_lst2[idx2]), list(cond_lst2[idx3]), warn_msg
+
+
+def get_three_components_and_evaluate(y, x, cond_lst):
+ K = 3
+ epsilon = 1e-4
+ lam = 0.1
+ iterations = 50
+ random_restarts = 5
+
+ # Remove NaNs or Infs
+ warn_msg = ''
+ sz = len(x)
+ if sz < MIN_NUMBER_OF_POINTS_FOR_MIXTURE_OF_GAUSSIAN: # too few points, ignore.
+ return None, None, None, None, None, None, None, None, None, 'IGNORE'
+
+ index = np.isfinite(x) & np.isfinite(y)
+ if sum(index) < sz:
+ warn_msg = 'HAS_NAN_OR_INIFNITE'
+ if sum(index) < MIN_NUMBER_OF_POINTS_FOR_MIXTURE_OF_GAUSSIAN:
+ return None, None, None, None, None, None, None, None, None, 'IGNORE'
+
+ xx = np.array(x[index])
+ yy = np.array(y[index])
+ cond_lst2 = np.array(cond_lst)
+ cond_lst2 = cond_lst2[index]
+
+ # Train the model
+ model = LinearRegressionsMixture(np.expand_dims(xx, axis=1), np.expand_dims(yy, axis=1), K=K)
+ model.train(epsilon=epsilon, lam=lam, iterations=iterations, random_restarts=random_restarts, verbose=False)
+ idx1 = np.array(model.gamma[:,0] > model.gamma[:,1]) & np.array(model.gamma[:,0] > model.gamma[:,2]) # model.gamma is a vector of posterior probabilities
+ idx2 = np.array(model.gamma[:,1] > model.gamma[:,0]) & np.array(model.gamma[:,1] > model.gamma[:,2])
+ idx3 = np.array(model.gamma[:,2] > model.gamma[:,0]) & np.array(model.gamma[:,2] > model.gamma[:,1])
+ rmse_avg, rmse_std = model.cross_validate(k_fold=10, verbose=False, silent=True)
+ warn_msg = 'rmse_avg=%4.2f,rmse_sd=%4.2f' % (rmse_avg, rmse_std)
+ return xx[idx1], yy[idx1], xx[idx2], yy[idx2], xx[idx3], yy[idx3], list(cond_lst2[idx1]), list(cond_lst2[idx2]), list(cond_lst2[idx3]), warn_msg
+
+
+def get_three_components_mixtools(y, x, cond_lst):
+
+ # Remove NaNs or Infs
+ warn_msg = ''
+ sz = len(x)
+ if sz < MIN_NUMBER_OF_POINTS_FOR_MIXTURE_OF_GAUSSIAN: # too few points, ignore.
+ return None, None, None, None, None, None, None, None, None, 'IGNORE'
+
+ index = np.isfinite(x) & np.isfinite(y)
+ if sum(index) < sz:
+ warn_msg = 'HAS_NAN_OR_INIFNITE'
+ if sum(index) < MIN_NUMBER_OF_POINTS_FOR_MIXTURE_OF_GAUSSIAN:
+ return None, None, None, None, None, None, None, None, None, 'IGNORE'
+
+ xx = np.array(x[index])
+ yy = np.array(y[index])
+ cond_lst2 = np.array(cond_lst)
+ cond_lst2 = cond_lst2[index]
+
+ # Train the model
+ mixtools = importr('mixtools')
+ try:
+ result = mixtools.regmixEM(FloatVector(yy), FloatVector(xx), epsilon = 1e-04, k=3, maxit=100)
+ except:
+ return None, None, None, None, None, None, None, None, None, 'IGNORE'
+ posterior = result[result.names.index('posterior')]
+ posterior = np.array(posterior)
+ l = np.argmax(posterior, axis=1) # class information
+ idx1 = l == 0
+ idx2 = l == 1
+ idx3 = l == 2
+ warn_msg = 'loglik=%4.2f' % (np.array(result[result.names.index('loglik')])[0])
+
+ return xx[idx1], yy[idx1], xx[idx2], yy[idx2], xx[idx3], yy[idx3], list(cond_lst2[idx1]), list(cond_lst2[idx2]), list(cond_lst2[idx3]), warn_msg
+
+
+def read_matrix_data(fname):
+ '''
+ fname - a file, first line is head, first column is row name.
+ '''
+
+ lineno = 0
+ colid = []
+ rowid = []
+ d = {} # {gene1:{cond1:val1, cond2:val2, ...}, gene2: {...}, ...}
+ d2 = {} # {cond1:{gene1:val1, gene2:val2, ...}, cond2: {...}, ...}
+ d3 = {} # {gene1: [], gene2: [], ...}
+ d4 = {} # {cond1:[], cond2:[], ...}
+
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+
+ head_line = lines[0].strip()
+ lst = head_line.split()
+ colid = lst[1:]
+
+ for c in colid:
+ d2[c] = {}
+ d4[c] = []
+
+ for line in lines[1:]:
+ line = line.strip()
+ lst = line.split()
+ g = lst[0]
+ rowid.append(g)
+ d[g] = {}
+ levels = lst[1:]
+ if len(levels) != len(colid):
+ print('Incomplete columns at row %s' % (g))
+ sys.exit()
+
+ d3[g] = []
+ for i in range(len(colid)):
+ c = colid[i]
+ d[g][c] = float(levels[i])
+ d2[c][g] = float(levels[i])
+ d3[g].append(float(levels[i]))
+ d4[c].append(float(levels[i]))
+ lineno += 1
+
+ d_return = {}
+ d_return['xy'] = d # first gene, then condition
+ d_return['yx'] = d2 # first condition, then gene
+ d_return['xx'] = d3 # each item is an array of gene expression levels, i.e., each item is a row
+ d_return['yy'] = d4 # each item is an array of gene expression levels, i.e., each item is a column
+ d_return['nrow'] = lineno - 1
+ d_return['ncol'] = len(colid)
+ d_return['rowid'] = rowid
+ d_return['colid'] = colid
+
+ d4_sorted = {}
+ for k in d4:
+ d4_sorted[k] = sorted(d4[k], reverse=True)
+ d_return['yy_sorted'] = d4_sorted
+
+ return d_return
+
+
+def get_value(s, delimit):
+ lst = s.split(delimit)
+ return lst[1].strip()
+
+def read_info_data(fname):
+ ''' Read chip-seq data information '''
+
+ if not os.path.exists(fname):
+ print('%s not exists.' % (fname) )
+ sys.exit()
+
+ d = {'ID_LIST':[]}
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ if line == '' or line.startswith('#') or line.startswith('%'):
+ continue
+ if line.startswith(DATA_SYMBOL):
+ s = line[line.rfind(DATA_SYMBOL[-1])+1:]
+ s = s.strip()
+ if s in d:
+ print('ID %s duplicate' % (s))
+ sys.exit()
+ d[s] = {'PROTEIN_ID':'', 'PROTEN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''}
+ d['ID_LIST'].append(s)
+ if line.startswith('DESCRIPTION:'):
+ d[s]['DESCRIPTION'] = get_value(line, ':')
+ elif line.startswith('PROTEN_NAME:'):
+ d[s]['PROTEN_NAME'] = get_value(line, ':')
+ elif line.startswith('PROTEIN_ID:'):
+ d[s]['PROTEIN_ID'] = get_value(line, ':')
+ elif line.startswith('DATA_NAME:'):
+ d[s]['DATA_NAME'] = get_value(line, ':')
+ elif line.startswith('DATA_FORMAT:'):
+ d[s]['DATA_FORMAT'] = get_value(line, ':')
+ elif line.startswith('LOCATION:'):
+ d[s]['LOCATION'] = get_value(line, ':')
+ elif line.startswith('NOTE:'):
+ d[s]['NOTE'] = get_value(line, ':')
+
+ return d
+
+
+def get_gene_list(fname):
+ result = []
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ lst = line.split()
+ result.append(lst[0])
+ f.close()
+ return result
+
+
+def get_related_condition(s, info_dict):
+ lst = s.split(';')
+ result = [] # a list of sample IDs
+ result = info_dict['ID_LIST'] # TBD
+ return result
+
+
+def update_global_param_dict(glb_param_dict, info_dict):
+ if glb_param_dict['RESEARCH_KEYWORDS'] == '':
+ glb_param_dict['USER_CONDITION_LIST'] = info_dict['ID_LIST']
+ glb_param_dict['USER_CONDITION_LIST'] = get_related_condition(glb_param_dict['RESEARCH_KEYWORDS'], info_dict)
+
+
+def get_threshold(lst):
+ x = np.array(lst)
+ x = x[x > 0]
+ return np.median(x)
+
+
+def get_threshold2(lst, glb_param_dict):
+ x = np.array(lst)
+ x = x[x > 0]
+ max_num = int(glb_param_dict['MAX_NUM_TARGETS'])
+ percent = float(glb_param_dict['OVERFLOW_TARGETS_PERCENTAGE'])
+ n = len(x)
+ if n < max_num:
+ return x[-1]
+ else: # include some overflowing targets, but not all
+ overflow = n - max_num
+ keep = int(overflow * percent)
+ index = keep + max_num
+ return x[index]
+
+def get_tf(g, bind_dict, info_dict, input_dict, glb_param_dict):
+ tf_dict = {}
+ d = bind_dict['xy']
+ input_d = input_dict['xy']
+ input_cond = input_dict['colid'][0] # use the first column as input (improve)
+ if g in d.keys():
+ for c in bind_dict['colid']:
+ bind_val = d[g][c]
+ if info_dict[c]['DATA_FORMAT'].upper() == 'BW':
+ input_val = input_d[g][input_cond]
+ if g == 'AT1G65480': # FT, target is FT
+ #print('DEBUG target:%s protein=%s bv=%g, iv=%g, ratio=%g' % (g, info_dict[c]['PROTEIN_ID'], bind_val, input_val, bind_val/input_val))
+ pass
+ if input_val > 0 and input_val < 10000 and (bind_val / input_val) > SIGNAL_INPUT_RATIO_TAU: # input_val should also be not too large
+ g2 = info_dict[c]['PROTEIN_ID']
+ if g2 != '':
+ if not g2 in tf_dict:
+ tf_dict[g2] = [c]
+ else:
+ tf_dict[g2].append(c)
+ elif info_dict[c]['DATA_FORMAT'].upper() == 'NARROWPEAK':
+ #tau = bind_dict['yy_sorted'][c][TOP_N_TF]
+ tau = get_threshold2(bind_dict['yy_sorted'][c], glb_param_dict)
+ #print('DEBUG target=%s %s %g >= %g' % (g, info_dict[c]['PROTEIN_ID'], bind_val, tau))
+ if bind_val >= tau: # change later
+ g2 = info_dict[c]['PROTEIN_ID']
+ if g2 != '':
+ if not g2 in tf_dict:
+ tf_dict[g2] = [c]
+ else:
+ tf_dict[g2].append(c)
+
+ return tf_dict
+
+
+
+def get_gene_expression(gene_id, cond_lst, expr_dict, takelog=False):
+
+ num_cond = len(cond_lst)
+ elst = [None]*num_cond
+ d = expr_dict['xy']
+ for i in range(num_cond):
+ c = cond_lst[i]
+ x = d[gene_id][c]
+ if takelog == True:
+ elst[i] = np.log(x+1)
+ else:
+ elst[i] = x
+ return np.array( elst )
+
+
+def float_equal(x, y):
+ return np.abs(x-y) < 0.001
+
+
+def get_gene_expression2(gene_id1, gene_id2, cond_lst, expr_dict, takelog=False):
+ ''' get gene expression for two genes. Conditions in which two genes have zero TPM values are ignored. '''
+ num_cond = len(cond_lst)
+ elst1 = [None]*num_cond
+ elst2 = [None]*num_cond
+ clst = [None]*num_cond
+ d = expr_dict['xy']
+ j = 0
+ for i in range(num_cond):
+ c = cond_lst[i]
+ x = expr_dict['xy'][gene_id1][c]
+ y = expr_dict['xy'][gene_id2][c]
+ #print('DEBUG %s %s %g %g c=%s' % (gene_id1, gene_id2, x, y, c))
+ #print('DEBUG at2g07745 at R0000SRR1802166XX %g' % (expr_dict['xy']['AT2G07754']['R0000SRR1802166XX']))
+ if not float_equal(x,0.0) or not float_equal(y,0.0): # at least one is not zero
+ if takelog == True: # increase gene expression uniformly by 1 for taking logarithm
+ elst1[j] = np.log(x+1)
+ elst2[j] = np.log(y+1)
+ else:
+ elst1[j] = x
+ elst2[j] = y
+ clst[j] = c
+ j += 1
+ return ( np.array(elst1[0:j]), np.array(elst2[0:j]), clst[0:j] )
+
+
+
+def get_gene_expression3(gene_id1, gene_id2, cond_lst, expr_dict, takelog=False):
+ '''
+ get gene expression for two genes. Conditions in which two genes have zero TPM values are ignored.
+ in addition, vertical strip and horizontal strip are removed.
+ '''
+ num_cond = len(cond_lst)
+ elst1 = [None]*num_cond
+ elst2 = [None]*num_cond
+ mark_cond = [True]*num_cond # indicate if a condition should be included
+ clst = []
+ d = expr_dict['xy']
+
+ for i in range(num_cond):
+ c = cond_lst[i]
+ x = expr_dict['xy'][gene_id1][c]
+ y = expr_dict['xy'][gene_id2][c]
+ if not float_equal(x,0.0) or not float_equal(y,0.0): # at least one is not zero
+ if takelog == True: # increase gene expression uniformly by 1 for taking logarithm
+ elst1[i] = np.log(x+1)
+ elst2[i] = np.log(y+1)
+ else:
+ elst1[i] = x
+ elst2[i] = y
+ if elst1[i] < REMOVE_VERTICAL_STRIP_TAU or elst2[i] < REMOVE_HORIZONTAL_STRIP_TAU: # don't include this condition if its values are in the strip
+ mark_cond[i] = False
+ else:
+ clst.append(c)
+ else:
+ mark_cond[i] = False
+
+ a = np.array(elst1)
+ a = a[mark_cond]
+ b = np.array(elst2)
+ b = b[mark_cond]
+ return (a.astype(np.float64), b.astype(np.float64), clst)
+
+
+#################### select condition stuff ###############
+
+def get_yhat(slope, intercept, x):
+ yhat = intercept + slope * x
+ return yhat
+
+def select_points_theil(x, y, max_diff):
+ theil_result = stat.mstats.theilslopes(x, y)
+ slope = theil_result[0]
+ intercept = theil_result[1]
+ yhat = get_yhat(slope, intercept, x)
+ d = y - yhat
+ d_abs = np.abs(d)
+ index = d_abs < max_diff
+ return (x[index], y[index], index)
+
+def common_elements(list1, list2):
+ return sorted(list(set(list1).intersection(list2)))
+
+def two_points(p1, p2):
+ '''Return slope and intercept '''
+ x1, y1 = p1
+ x2, y2 = p2
+ m = (y2 - y1) / (x2 - x1)
+ b = y1 - m * x1
+ return (m, b)
+
+def select_points_diagonal(x, y, max_diff, direction):
+ ''' positive direction '''
+
+ n = len(x)
+ if n < 3:
+ return (x, y)
+
+ if direction.lower() == 'pos':
+ index_x = np.argsort(x)
+ elif direction.lower() == 'neg':
+ index_x = np.argsort(-1*x)
+ else:
+ print('%s must be pos or neg' % (direction))
+ sys.exit()
+
+ index_y = np.argsort(y)
+
+ # get lower (or upper) end point
+ idx = None
+ for i in range(2,n+1): # get common index
+ s1 = index_x[0:i]
+ s2 = index_y[0:i]
+ s = common_elements(s1, s2)
+ if s != []:
+ idx = s[0]
+ break
+ p1 = (x[idx], y[idx])
+
+ # get upper (or lower) end point
+ index_x = list(reversed(index_x)) # reverse list
+ index_y = list(reversed(index_y))
+
+ idx = None
+ for i in range(2,n+1): # get common index
+ s1 = index_x[0:i]
+ s2 = index_y[0:i]
+ s = common_elements(s1, s2)
+ if s != []:
+ idx = s[0]
+ break
+ p2 = (x[idx], y[idx])
+
+ slope, intercept = two_points(p1, p2)
+
+ yhat = get_yhat(slope, intercept, x)
+ d = y - yhat
+ d_abs = np.abs(d)
+ try:
+ index = d_abs < max_diff
+ except RuntimeWarning:
+ print(d_abs)
+ print(max_diff)
+ sys.exit()
+
+ return (x[index], y[index], index)
+
+
+def correlation_is_significant(r, p, glb_param_dict):
+ return (not np.isnan(r)) and np.abs(r) > float(glb_param_dict['TWO_WAY_CORRELATION_CUTOFF']) and p < float(glb_param_dict['TWO_WAY_CORRELATION_PVALUE_CUTOFF'])
+
+
+def subset_list(lst, bool_index):
+ if len(lst) != len(bool_index):
+ print('subset_list: list size not equal (%d %s)', len(lst), len(bool_index))
+ sys.exit()
+
+ n = len(lst)
+ result = []
+ for i in range(n):
+ if bool_index[i] == True:
+ result.append(lst[i])
+ return result
+
+
+def print_dict(d, glb_param_dict):
+ agi2name_dict = glb_param_dict['name_conversion_dict']
+ curr_date = datetime.now().strftime('%Y%m%d') # add date to the end of each line, for future reference or filtering
+ if d['type'] == 'two-way':
+ gene_id = d['target']
+ head = '%s %s\t' % (gene_id, get_gene_name(gene_id, agi2name_dict))
+ gene_id = d['TF']
+ head += '%s %s\t' % (gene_id, get_gene_name(gene_id, agi2name_dict))
+ d2 = d['significant']
+ if 'all' in d2:
+ s = '%4.2f\t%s\t%s\t%s\t%s' % (d2['all']['score'], 'all', '.', ' '.join(d2['all']['chip_id']), '.')
+ s += '\t' + curr_date
+ print(head + s)
+ sys.stdout.flush() # flush to stdout, so we can see the results immedialtely.
+ if 'user' in d2:
+ s = '%4.2f\t%s\t%s\t%s\t%s' % (d2['user']['score'], 'user', ' '.join(d2['user']['signal_set']), ' '.join(d2['user']['chip_id']), '.')
+ print(head + s)
+ if 'pos' in d2:
+ s = '%4.2f\t%s\t%s\t%s\t%s' % (d2['pos']['score'], 'pos', ' '.join(d2['pos']['signal_set']), ' '.join(d2['pos']['chip_id']), '.')
+ print(head + s)
+ if 'neg' in d2:
+ s = '%4.2f\t%s\t%s\t%s\t%s' % (d2['neg']['score'], 'neg', ' '.join(d2['neg']['signal_set']), ' '.join(d2['neg']['chip_id']), '.')
+ print(head + s)
+ if 'mix' in d2:
+ n = len(d2['mix']['score'])
+ for i in range(n):
+ s = '%4.2f\t%s\t%s\t%s\t%s' % (d2['mix']['score'][i], 'mix', ' '.join(d2['mix']['signal_set'][i]), ' '.join(d2['mix']['chip_id']), d2['mix']['message'][i])
+ s += '\t' + curr_date
+ print(head + s)
+ sys.stdout.flush() # flush to stdout, so we can see the results immedialtely.
+
+
+def two_way(target, tf_dict, expr_dict, expr_info_dict, glb_param_dict):
+ '''
+
+ Check if target has relationship with each of TFs.
+
+ tf_dict: a dictionary of TFs, {tf_name:ChIP_ID_LIST)
+
+ Return a list of dictionaries. Each dictionary has the following format:
+
+ 'type' :'two-way'
+ 'target':''
+ 'TF' :''
+ 'significant': {
+ 'all': {'signal_set':[], score=2, chip_id:''}
+ 'pos_direction':{'signal_set':[], score:.0, chip_id:''}
+ 'neg_direction':{'signal_set':[], score:.1, chip_id:''}
+ 'user_defined': {'signal_set':[], score:.3, chip_id:''}
+ 'mix': {'signal_set':[], score:[.7,-.5], chip_id:''}
+ }
+
+ '''
+
+ result_dict_lst = [] # a list of dictionaries, one for each TF, each dict contains info for a Target-TF pair
+
+ target_gene_id = target
+ all_cond_lst = expr_dict['colid'] # Use all RNA-seq samples. TBD, can be glb_param_dict['USER_CONDITION_LIST']
+ logrithmize = glb_param_dict['LOGRITHMIZE'].upper() == 'YES' # take logarithmic of TPM values
+ #target_elst = get_gene_expression(target_gene_id, all_cond_lst, expr_dict, takelog=logrithmize) # a list of gene expression levels
+
+ if not target_gene_id in expr_dict['rowid']: # target gene not in expression table, cannot do anything
+ return result_dict_lst
+
+ for tf_gene_id in sorted(tf_dict.keys()): # y is a TF gene id
+
+ chip_id = tf_dict[tf_gene_id] # a list of chip experiment IDs, e.g., C00000000000
+
+ if not tf_gene_id in expr_dict['rowid']: # tf gene not in expression table, cannot do anything
+ continue
+
+ # get gene expression profiles for target and TF. If in a RNA-seq sample, both target and TF is 0, then this sample is ignored.
+ target_elst, tf_elst, clist = get_gene_expression3(target_gene_id, tf_gene_id, all_cond_lst, expr_dict, takelog=logrithmize)
+
+ r, p = stat.pearsonr(target_elst, tf_elst)
+
+ d = {}
+ d['target'] = target_gene_id
+ d['TF'] = tf_gene_id
+ d['type'] = 'two-way'
+ d['significant'] = {}
+
+ all_good = False
+ if correlation_is_significant(r, p, glb_param_dict):
+ d['significant']['all'] = {}
+ d['significant']['all']['signal_set'] = clist # a list of sample IDs, returned by get_gene_expression3
+ d['significant']['all']['score'] = r
+ d['significant']['all']['chip_id'] = chip_id
+ all_good = True
+
+ user_cond_lst = glb_param_dict['USER_CONDITION_LIST']
+ if glb_param_dict['RESEARCH_KEYWORDS'] != '' and user_cond_lst != []:
+ target_elst_user, tf_elst_user, clist_user = get_gene_expression3(target_gene_id, tf_gene_id, user_cond_lst, expr_dict, takelog=logrithmize)
+
+ r, p = stat.pearsonr(target_elst_user, tf_elst_user)
+ if correlation_is_significant(r, p, glb_param_dict):
+ d['significant']['user'] = {}
+ d['significant']['user']['signal_set'] = user_cond_lst
+ d['significant']['user']['score'] = r
+ d['significant']['user']['chip_id'] = chip_id
+
+ # obsolete
+ max_diff = glb_param_dict['SELECT_POINTS_DIAGONAL_MAX_DIFF']
+ if glb_param_dict['LOOK_FOR_POS_CORRELATION'] == 'YES':
+ aa, bb, index_pos = select_points_diagonal(target_elst, tf_elst, max_diff, 'pos')
+ r_pos, p_pos = stat.pearsonr(aa, bb)
+ if correlation_is_significant(r_pos, p_pos, glb_param_dict) and sum(index_pos) >= MIN_NUM_CONDITION:
+ d['significant']['pos'] = {}
+ d['significant']['pos']['signal_set'] = subset_list(all_cond_lst, index_pos)
+ d['significant']['pos']['score'] = r_pos
+ d['significant']['pos']['chip_id'] = chip_id
+
+ # obsolete
+ if glb_param_dict['LOOK_FOR_NEG_CORRELATION'] == 'YES':
+ aa, bb, index_neg = select_points_diagonal(target_elst, tf_elst, max_diff, 'neg')
+ r_neg, p_neg = stat.pearsonr(aa, bb)
+ if correlation_is_significant(r_neg, p_neg, glb_param_dict) and sum(index_neg) >= MIN_NUM_CONDITION:
+ d['significant']['neg'] = {}
+ d['significant']['neg']['signal_set'] = subset_list(all_cond_lst, index_neg)
+ d['significant']['neg']['score'] = r_neg
+ d['significant']['neg']['chip_id'] = chip_id
+
+ K = int(glb_param_dict['NUMBER_OF_COMPONENTS'])
+ if glb_param_dict['MIXTURE_OF_REGRESSION'] == 'YES' and not all_good: # look hard only when using all RNA-seq data does not produce good results
+ if K == 2: # for now consider two components
+ #print('DEBUG len1=%d, len=%d' % (len(target_elst), len(tf_elst)))
+ #print('DEBUG %s, %s, %s' % (target_gene_id, tf_gene_id, ' '.join(clist)))
+ index1, index2, msg = get_two_components(target_elst, tf_elst) # get two Gaussian Mixture Model components
+ if msg != 'IGNORE':
+ aa = target_elst[index1]
+ bb = tf_elst[index1]
+ r_mix1, p_mix1 = stat.pearsonr(aa, bb)
+ aa = target_elst[index2]
+ bb = tf_elst[index2]
+ r_mix2, p_mix2 = stat.pearsonr(aa, bb)
+ #print('DEBUG %s %s r_mix1:%g r_mix2:%g' % (target_gene_id, tf_gene_id, r_mix1, r_mix2))
+ flag1 = correlation_is_significant(r_mix1, p_mix1, glb_param_dict)
+ flag2 = correlation_is_significant(r_mix2, p_mix2, glb_param_dict)
+ if flag1 or flag2:
+ d['significant']['mix'] = {}
+ d['significant']['mix']['signal_set'] = []
+ d['significant']['mix']['score'] = []
+ d['significant']['mix']['chip_id'] = chip_id
+ if flag1:
+ d['significant']['mix']['signal_set'].append(subset_list(clist, index1))
+ d['significant']['mix']['score'].append(r_mix1)
+ if flag2:
+ d['significant']['mix']['signal_set'].append(subset_list(clist, index2))
+ d['significant']['mix']['score'].append(r_mix2)
+
+ if K == 3: # three components
+ aa1, bb1, aa2, bb2, aa3, bb3, cond1, cond2, cond3, msg = get_three_components_mixtools(target_elst, tf_elst, clist) # get two Gaussian Mixture Model components
+ if msg != 'IGNORE':
+ r_mix1, p_mix1 = stat.pearsonr(aa1, bb1)
+ r_mix2, p_mix2 = stat.pearsonr(aa2, bb2)
+ r_mix3, p_mix3 = stat.pearsonr(aa3, bb3)
+ #print('DEBUG %s, %s' % (target_gene_id, tf_gene_id))
+ #print('DEBUG rmix1=%g, pmix1=%g' % (r_mix1, p_mix1))
+ #print('DEBUG rmix2=%g, pmix2=%g' % (r_mix2, p_mix2))
+ #print('DEBUG rmix3=%g, pmix3=%g' % (r_mix3, p_mix3))
+ #print('DEBUG %d %d %d' %(len(aa1), len(aa2), len(aa3)))
+ min_num_points = int(glb_param_dict['CORRELATION_BASED_ON_AT_LEAST_N_POINTS'])
+ flag1 = correlation_is_significant(r_mix1, p_mix1, glb_param_dict) and len(aa1) > min_num_points
+ flag2 = correlation_is_significant(r_mix2, p_mix2, glb_param_dict) and len(aa2) > min_num_points
+ flag3 = correlation_is_significant(r_mix3, p_mix3, glb_param_dict) and len(aa3) > min_num_points
+ if flag1 or flag2 or flag3:
+ d['significant']['mix'] = {}
+ d['significant']['mix']['signal_set'] = []
+ d['significant']['mix']['score'] = []
+ d['significant']['mix']['chip_id'] = chip_id
+ d['significant']['mix']['message'] = []
+ if flag1:
+ d['significant']['mix']['signal_set'].append(cond1)
+ d['significant']['mix']['score'].append(r_mix1)
+ d['significant']['mix']['message'].append(msg)
+ if flag2:
+ d['significant']['mix']['signal_set'].append(cond2)
+ d['significant']['mix']['score'].append(r_mix2)
+ d['significant']['mix']['message'].append(msg)
+ if flag3:
+ d['significant']['mix']['signal_set'].append(cond3)
+ d['significant']['mix']['score'].append(r_mix3)
+ d['significant']['mix']['message'].append(msg)
+
+ if len(d['significant']) > 0: # significant edges exist
+ print_dict(d, glb_param_dict)
+ #result_dict_lst.append(d)
+
+ return result_dict_lst
+
+
+def three_way(target, tf_lst, expr_dict, expr_info_dict, glb_param_dict):
+ ''' TBD '''
+ return []
+
+
+def establish_edges(expr_dict, expr_info_dict, bind_dict, bind_info_dict, input_dict, glb_param_dict):
+ high_gene_lst = glb_param_dict['HIGH_PRIORITY_GENE'].split()
+ gene_lst = get_gene_list(glb_param_dict['GENE_LIST'])
+ final_gene_lst = list(set(high_gene_lst)) # unique genes
+ for x in gene_lst:
+ if not x in high_gene_lst:
+ final_gene_lst.append(x)
+
+ update_global_param_dict(glb_param_dict, expr_info_dict)
+ result_d = {'two_way_edges':{}, 'three_way_edges':{}}
+ for g in final_gene_lst:
+ tf_dict = get_tf(g, bind_dict, bind_info_dict, input_dict, glb_param_dict)
+ if len(tf_dict) > 0:
+ key = g
+ if glb_param_dict['TWO_WAY'] == 'YES':
+ two_dict = two_way(g, tf_dict, expr_dict, expr_info_dict, glb_param_dict)
+ result_d['two_way_edges'][key] = two_dict
+ if glb_param_dict['THREE_WAY'] == 'YES':
+ three_dict = three_way(g, tf_dict, expr_dict, expr_info_dict, glb_param_dict)
+ result_d['three_way_edges'][key] = three_dict
+
+ return result_d
+
+
+def dumpclean(obj):
+ '''
+ show dictionary content, recursively
+ '''
+ if type(obj) == dict:
+ for k, v in obj.items():
+ if hasattr(v, '__iter__'):
+ print(k)
+ dumpclean(v)
+ else:
+ print('%s : %s' % (k, v))
+ elif type(obj) == list:
+ for v in obj:
+ if hasattr(v, '__iter__'):
+ dumpclean(v)
+ else:
+ print(v)
+ else:
+ print(obj)
+
+
+# obsolete
+def print_dict_list(dict_lst, agi2name_dict):
+ for d in dict_lst:
+ #dumpclean(d)
+ if d['type'] == 'two-way':
+ gene_id = d['target']
+ head = '%s %s\t' % (gene_id, get_gene_name(gene_id, agi2name_dict))
+ gene_id = d['TF']
+ head += '%s %s\t' % (gene_id, get_gene_name(gene_id, agi2name_dict))
+ d2 = d['significant']
+ if 'all' in d2:
+ s = '%4.2f\t%s\t%s\t%s' % (d2['all']['score'], 'all', '.', ' '.join(d2['all']['chip_id']))
+ print(head + s)
+ if 'user' in d2:
+ s = '%4.2f\t%s\t%s\t%s' % (d2['user']['score'], 'user', ' '.join(d2['user']['signal_set']), ' '.join(d2['user']['chip_id']))
+ print(head + s)
+ if 'pos' in d2:
+ s = '%4.2f\t%s\t%s\t%s' % (d2['pos']['score'], 'pos', ' '.join(d2['pos']['signal_set']), ' '.join(d2['pos']['chip_id']))
+ print(head + s)
+ if 'neg' in d2:
+ s = '%4.2f\t%s\t%s\t%s' % (d2['neg']['score'], 'neg', ' '.join(d2['neg']['signal_set']), ' '.join(d2['neg']['chip_id']))
+ print(head + s)
+ if 'mix' in d2:
+ n = len(d2['mix']['score'])
+ for i in range(n):
+ s = '%4.2f\t%s\t%s\t%s' % (d2['mix']['score'][i], 'mix', ' '.join(d2['mix']['signal_set'][i]), ' '.join(d2['mix']['chip_id']))
+ print(head + s)
+
+def print_result(d, agi2name_dict):
+ for k in d:
+ print(k) # two-way or three-way
+ d2 = d[k]
+ for k2 in d2: # k2 is a gene
+ dlst = d2[k2]
+ print_dict_list(dlst, agi2name_dict)
+
+
+
+########## main ##################################################
+r.r['options'](warn=-1) # supress warning message from rpy2
+warnings.filterwarnings("ignore")
+param_file = sys.argv[1] # a single prameter file
+glb_param_dict = make_global_param_dict(param_file)
+agi2name_dict = make_gene_name_AGI_map_dict(GENE_ID_TO_GENE_NAME)
+glb_param_dict['name_conversion_dict'] = agi2name_dict
+#print('Read expression data')
+expr_dict = read_matrix_data(glb_param_dict['EXPRESSION_MATRIX'])
+#print('DEBUG at2g07754 at R0000SRR1802166XX %g' % (expr_dict['xy']['AT2G07754']['R0000SRR1802166XX']))
+expr_info_dict = read_info_data(glb_param_dict['EXPRESSION_INFO'])
+#print('Read binding data')
+bind_dict = read_matrix_data(glb_param_dict['BINDING_MATRIX'])
+bind_info_dict = read_info_data(glb_param_dict['BINDING_INFO'])
+input_dict = read_matrix_data(glb_param_dict['INPUT_MATRIX']) # newly added, for comparing with bw files
+#print('Establish edges')
+edge_d = establish_edges(expr_dict, expr_info_dict, bind_dict, bind_info_dict, input_dict, glb_param_dict)
+#print_result(edge_d, agi2name_dict)