From 97fdefab064f63642fa3ece05b807d29b459df31 Mon Sep 17 00:00:00 2001
From: Hui Lan
Date: Wed, 4 Dec 2019 19:03:19 +0800
Subject: brain: add python and R code to local repository.

---
 Code/make_target_tf.py | 290 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 290 insertions(+)
 create mode 100644 Code/make_target_tf.py

diff --git a/Code/make_target_tf.py b/Code/make_target_tf.py
new file mode 100644
index 0000000..1cb5147
--- /dev/null
+++ b/Code/make_target_tf.py
@@ -0,0 +1,290 @@
+# Usage: python make_target_tf.py parameter_for_net.txt > target_tf.txt
+#
+# Purpose: Make a target tf file: each line is 'Target TF1 Condition.list'.
+#          See ../Data/information/target_tf.txt for an example.
+#
+# Created on 17 JAN 2017, hui
+# Last modified on 16 Mar 2017, slcu, hui
+# Last modified on 5 Aug 2019, zjnu, hui
+# Last modified on 9 Oct 2019, zjnu, hui
+# Last modified on 22 Nov 2019, zjnu, hui [include binding information from two sources: target_tf.txt.20170629_143000 (results I made when I was at SLCU between 2016 and 2017) and target_tf_agris.txt]

+
+import sys, os, operator, itertools
+import numpy as np
+from param4net import make_global_param_dict
+
+####################################
+DATA_SYMBOL = '@'
+SIGNAL_INPUT_RATIO_TAU = 1.5
+####################################
+
+def read_matrix_data(fname):
+    '''
+    fname - a file, first line is head, first column is row name.
+    '''
+
+    lineno = 0
+    colid = []
+    rowid = []
+    d = {}  # {gene1:{cond1:val1, cond2:val2, ...}, gene2: {...}, ...}
+    d2 = {} # {cond1:{gene1:val1, gene2:val2, ...}, cond2: {...}, ...}
+    d3 = {} # {gene1: [], gene2: [], ...}
+    d4 = {} # {cond1:[], cond2:[], ...}
+
+    f = open(fname)
+    lines = f.readlines()
+    f.close()
+
+    head_line = lines[0].strip()
+    lst = head_line.split()
+    colid = lst[1:]
+
+    for c in colid:
+        d2[c] = {}
+        d4[c] = []
+
+    for line in lines[1:]:
+        line = line.strip()
+        lst = line.split()
+        g = lst[0]
+        rowid.append(g)
+        d[g] = {}
+        levels = lst[1:]
+        if len(levels) != len(colid):
+            print('Incomplete columns at row %s' % (g))
+            sys.exit()
+
+        d3[g] = []
+        for i in range(len(colid)):
+            c = colid[i]
+            d[g][c] = float(levels[i])
+            d2[c][g] = float(levels[i])
+            d3[g].append(float(levels[i]))
+            d4[c].append(float(levels[i]))
+        lineno += 1
+
+    d_return = {}
+    d_return['xy'] = d  # first gene, then condition
+    d_return['yx'] = d2 # first condition, then gene
+    d_return['xx'] = d3 # each item is an array of gene expression levels, i.e., each item is a row
+    d_return['yy'] = d4 # each item is an array of gene expression levels, i.e., each item is a column
+    d_return['nrow'] = lineno - 1
+    d_return['ncol'] = len(colid)
+    d_return['rowid'] = rowid
+    d_return['colid'] = colid
+
+    d4_sorted = {}
+    for k in d4:
+        d4_sorted[k] = sorted(d4[k], reverse=True) # largest numbers on the top
+    d_return['yy_sorted'] = d4_sorted
+
+    return d_return
+
+
+def get_value(s, delimit):
+    lst = s.split(delimit)
+    return lst[1].strip()
+
+
+def read_info_data(fname):
+    ''' Read ChIP-seq data information '''
+
+    if not os.path.exists(fname):
+        print('%s does not exist.' % (fname))
+        sys.exit()
+
+    d = {'ID_LIST':[]}
+    f = open(fname)
+    lines = f.readlines()
+    f.close()
+    for line in lines:
+        line = line.strip()
+        if line == '' or line.startswith('#') or line.startswith('%'):
+            continue
+        if line.startswith(DATA_SYMBOL):
+            s = line[line.rfind(DATA_SYMBOL[-1])+1:]
+            s = s.strip()
+            if s in d:
+                print('make_target_tf: ID %s duplicate' % (s))
+                sys.exit()
+            d[s] = {'PROTEIN_ID':'', 'PROTEN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''}
+            d['ID_LIST'].append(s)
+        if line.startswith('DESCRIPTION:'):
+            d[s]['DESCRIPTION'] = get_value(line, ':')
+        elif line.startswith('PROTEN_NAME:'):
+            d[s]['PROTEN_NAME'] = get_value(line, ':')
+        elif line.startswith('PROTEIN_ID:'):
+            d[s]['PROTEIN_ID'] = get_value(line, ':')
+        elif line.startswith('DATA_NAME:'):
+            d[s]['DATA_NAME'] = get_value(line, ':')
+        elif line.startswith('DATA_FORMAT:'):
+            d[s]['DATA_FORMAT'] = get_value(line, ':')
+        elif line.startswith('LOCATION:'):
+            d[s]['LOCATION'] = get_value(line, ':')
+        elif line.startswith('NOTE:'):
+            d[s]['NOTE'] = get_value(line, ':')
+
+    return d
+
+
+def get_gene_list(fname):
+    result = []
+    f = open(fname)
+    for line in f:
+        line = line.strip()
+        lst = line.split()
+        result.append(lst[0])
+    f.close()
+    return result
+
+
+def get_threshold2(lst, glb_param_dict):
+    x = np.array(lst)
+    x = x[x > 0]
+    max_num = int(glb_param_dict['MAX_NUM_TARGETS']) # max number of targets for a protein
+    percent = float(glb_param_dict['OVERFLOW_TARGETS_PERCENTAGE']) # if we have more targets than the max number, then include this percentage of the excess targets
+    n = len(x)
+    if n < max_num:
+        return x[-1]
+    else: # include some overflowing targets, but not all
+        overflow = n - max_num
+        keep = int(overflow * percent)
+        index = keep + max_num
+        return x[index]
+
+
+def convert_dict(d):
+    '''
+    d = {tf:{cond1:[target1, target2], cond2:[...]}}
+    result = {target:{tf:[c1,c2], tf:[c2,c3]}, ... }
+    '''
+
+    result = {}
+    for k in d: # k is tf
+        vd = d[k] # vd is something like {cond1:[target1, target2], cond2:[...]}
+        for c in vd:
+            lst = vd[c] # a list of targets
+            for x in lst: # x is a target
+                if not x in result:
+                    result[x] = {k:[c]}
+                else:
+                    if not k in result[x]:
+                        result[x][k] = [c]
+                    else:
+                        result[x][k].append(c)
+
+    return result
+
+
+def get_tf(bind_dict, info_dict, input_dict, glb_param_dict):
+
+    tf_dict = {} # key is TF, value is a list of targets: [target1, target2, target3, ...]
+    #input_cond = input_dict['colid']
+    if len(input_dict) > 0:
+        input_cond = input_dict['colid'][0] # use the first column of INPUT matrix as input (improve). INPUT is used for format BW.
+    else:
+        input_cond = 'NA'
+
+    gene_id = np.array( bind_dict['rowid'] )
+
+    for c in bind_dict['colid']: # check each column (protein) in binding matrix. Find the protein's targets.
+        #print(c)
+        g2 = info_dict[c]['PROTEIN_ID'] # g2 is TF
+        bind_val = np.array( bind_dict['yy'][c] ) # a column of values
+
+        if info_dict[c]['DATA_FORMAT'].upper() == 'BW': # requires more consideration in the future
+            input_val = np.array( input_dict['yy'][input_cond] )
+            index = np.logical_and( np.logical_and(input_val > 0, input_val < 10000), (bind_val / input_val) > SIGNAL_INPUT_RATIO_TAU) # ignore intensities greater than 10000 as these are definitely noise
+
+        elif info_dict[c]['DATA_FORMAT'].upper() == 'NARROWPEAK':
+            tau = get_threshold2(bind_dict['yy_sorted'][c], glb_param_dict)
+            index = bind_val >= tau
+        else:
+            print('make_target_tf: Data format %s not recognised. Only bw and narrowPeak are valid.' % (info_dict[c]['DATA_FORMAT']))
+            sys.exit()
+
+        target_gene_id = gene_id[index]
+        if g2 != '' and g2 != 'id_unknown':
+            if not g2 in tf_dict:
+                tf_dict[g2] = {c:list(target_gene_id)}
+            else:
+                tf_dict[g2][c] = list(target_gene_id)
+
+    # tf_dict is a bit complicated: key is TF, value is a dictionary
+    # where its key is condition, and value is a list of target genes.
+    # It basically says this TF under condition c binds to a list of
+    # target genes.
+    d = convert_dict(tf_dict)
+    return d
+
+
+def augment_dict(d, target, tf, cond_lst):
+    ''' Enlarge d '''
+    if not target in d:
+        d[target] = {tf:cond_lst}
+    else:
+        if not tf in d[target]:
+            d[target][tf] = cond_lst
+        else:
+            cond_lst.extend(d[target][tf])
+            d[target][tf] = sorted(list(set(cond_lst)))
+
+
+def target_tf(bind_dict, bind_info_dict, input_dict, glb_param_dict):
+    ''' Print lines in this format: target TF ChIP-seq conditions, e.g., ../Data/information/target_tf.txt
+        For example, 'AT1G01270 AT3G46640 C0001000008426 C0001000008427 C0001000008428'
+    '''
+    d = get_tf(bind_dict, bind_info_dict, input_dict, glb_param_dict)
+    # d has the following format: {target:{tf1:[c1,c2], tf2:[c2,c3]}, ... }
+
+    # augment d with information from ../Data/information/target_tf_agris.txt and ../Data/information/target_tf.txt.20170629_143000
+    if os.path.exists('../Data/information/target_tf_agris.txt'):
+        f = open('../Data/information/target_tf_agris.txt')
+        lines = f.readlines()
+        f.close()
+        for line in lines:
+            line = line.strip()
+            lst = line.split('\t')
+            if len(lst) == 3:
+                target0 = lst[0]
+                tf0 = lst[1]
+                cond_lst0 = lst[2].split()
+                augment_dict(d, target0, tf0, cond_lst0)
+
+    if os.path.exists('../Data/information/target_tf.txt.20170629_143000'):
+        f = open('../Data/information/target_tf.txt.20170629_143000')
+        lines = f.readlines()
+        f.close()
+        for line in lines:
+            line = line.strip()
+            lst = line.split('\t')
+            if len(lst) == 3:
+                target0 = lst[0]
+                tf0 = lst[1]
+                cond_lst0 = lst[2].split()
+                augment_dict(d, target0, tf0, cond_lst0)
+
+    for target in sorted(d.keys()):
+        tf_d = d[target]
+        if len(tf_d) > 0:
+            for tf in sorted(tf_d.keys()):
+                cond_lst = sorted(list(set(tf_d[tf])))
+                #if len(cond_lst) > 1 and 'C0000000000001' in cond_lst: # C0000000000001 is for binding evidence from agris
+                #    cond_lst.remove('C0000000000001')
+                print('%s\t%s\t%s' % (target, tf, ' '.join(cond_lst) ) )
+
+
+########## main ##################################################
+param_file = sys.argv[1] # a single parameter file, parameter_for_net.txt
+glb_param_dict = make_global_param_dict(param_file)
+#print('Read binding data ...')
+bind_dict = read_matrix_data(glb_param_dict['BINDING_MATRIX'])
+bind_info_dict = read_info_data(glb_param_dict['BINDING_INFO'])
+
+if os.path.exists(glb_param_dict['INPUT_MATRIX']):
+    input_dict = read_matrix_data(glb_param_dict['INPUT_MATRIX']) # for comparing with bw files
+else:
+    input_dict = {}
+
+#print('Make target TF lines ...')
+target_tf(bind_dict, bind_info_dict, input_dict, glb_param_dict)
--
cgit v1.2.1