path: root/Code/make_target_tf.py
author     Hui Lan <lanhui@zjnu.edu.cn>   2019-12-04 19:03:19 +0800
committer  Hui Lan <lanhui@zjnu.edu.cn>   2019-12-04 19:03:19 +0800
commit     97fdefab064f63642fa3ece05b807d29b459df31 (patch)
tree       a058530023224f3e35b1783996f3530c80c04bc5 /Code/make_target_tf.py
brain: add python and R code to local repository.
Diffstat (limited to 'Code/make_target_tf.py')
-rw-r--r--   Code/make_target_tf.py   290
1 file changed, 290 insertions, 0 deletions
diff --git a/Code/make_target_tf.py b/Code/make_target_tf.py
new file mode 100644
index 0000000..1cb5147
--- /dev/null
+++ b/Code/make_target_tf.py
@@ -0,0 +1,290 @@
+# Usage: python make_target_tf.py parameter_for_net.txt > target_tf.txt
+#
+# Purpose: Make a target TF file: each line is 'Target TF Condition.list' (a target gene, a TF, and a space-separated list of ChIP condition IDs).
+# See ../Data/information/target_tf.txt for an example.
+#
+# Created on 17 JAN 2017, hui
+# Last modified on 16 Mar 2017, slcu, hui
+# Last modified on 5 Aug 2019, zjnu, hui
+# Last modified on 9 Oct 2019, zjnu, hui
+# Last modified on 22 Nov 2019, zjnu, hui [include binding information from two sources: target_tf.txt.20170629_143000 (results I made when I was at SLCU between 2016 and 2017) and target_tf_agris.txt]
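+#
+# Example (illustrative; the actual IDs come from the matrices named in
+# parameter_for_net.txt): running
+#
+#   python make_target_tf.py parameter_for_net.txt > target_tf.txt
+#
+# produces lines such as
+#
+#   AT1G01270  AT3G46640  C0001000008426 C0001000008427 C0001000008428
+#
+# i.e. target gene, TF, then a space-separated list of ChIP condition IDs.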
+
+import sys, os, operator, itertools
+import numpy as np
+from param4net import make_global_param_dict
+
+####################################
+DATA_SYMBOL = '@'
+SIGNAL_INPUT_RATIO_TAU = 1.5
+####################################
+
+def read_matrix_data(fname):
+ '''
+    fname - a whitespace-delimited file; the first line is the header (condition IDs) and the first column holds the row names (gene IDs).
+ '''
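+    # A minimal sketch of the expected layout (gene and condition IDs are
+    # hypothetical; any whitespace-separated matrix of this shape works):
+    #
+    #   gene        C0001   C0002   C0003
+    #   AT1G01010   0.00    3.25    1.10
+    #   AT1G01020   5.80    0.00    2.40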
+
+ lineno = 0
+ colid = []
+ rowid = []
+ d = {} # {gene1:{cond1:val1, cond2:val2, ...}, gene2: {...}, ...}
+ d2 = {} # {cond1:{gene1:val1, gene2:val2, ...}, cond2: {...}, ...}
+ d3 = {} # {gene1: [], gene2: [], ...}
+ d4 = {} # {cond1:[], cond2:[], ...}
+
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+
+ head_line = lines[0].strip()
+ lst = head_line.split()
+ colid = lst[1:]
+
+ for c in colid:
+ d2[c] = {}
+ d4[c] = []
+
+ for line in lines[1:]:
+ line = line.strip()
+ lst = line.split()
+ g = lst[0]
+ rowid.append(g)
+ d[g] = {}
+ levels = lst[1:]
+ if len(levels) != len(colid):
+ print('Incomplete columns at row %s' % (g))
+ sys.exit()
+
+ d3[g] = []
+ for i in range(len(colid)):
+ c = colid[i]
+ d[g][c] = float(levels[i])
+ d2[c][g] = float(levels[i])
+ d3[g].append(float(levels[i]))
+ d4[c].append(float(levels[i]))
+ lineno += 1
+
+ d_return = {}
+ d_return['xy'] = d # first gene, then condition
+ d_return['yx'] = d2 # first condition, then gene
+ d_return['xx'] = d3 # each item is an array of gene expression levels, i.e., each item is a row
+ d_return['yy'] = d4 # each item is an array of gene expression levels, i.e., each item is a column
+    d_return['nrow'] = lineno  # number of data rows; lineno already excludes the header line
+ d_return['ncol'] = len(colid)
+ d_return['rowid'] = rowid
+ d_return['colid'] = colid
+
+ d4_sorted = {}
+ for k in d4:
+ d4_sorted[k] = sorted(d4[k], reverse=True) # largest numbers on the top
+ d_return['yy_sorted'] = d4_sorted
+
+ return d_return
+
+
+def get_value(s, delimit):
+ lst = s.split(delimit)
+ return lst[1].strip()
+
+
+def read_info_data(fname):
+ ''' Read ChIP-seq data information '''
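+    # A minimal sketch of the expected record format (the ID and field values
+    # below are hypothetical; the field names match the parser below, including
+    # the PROTEN_NAME spelling):
+    #
+    #   @C0001000008426
+    #   PROTEIN_ID: AT3G46640
+    #   PROTEN_NAME: SomeTF
+    #   DATA_NAME: some_chip_experiment
+    #   DATA_FORMAT: narrowPeak
+    #   DESCRIPTION: ...
+    #   LOCATION: ...
+    #   NOTE: ...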
+
+ if not os.path.exists(fname):
+        print('%s does not exist.' % (fname))
+ sys.exit()
+
+ d = {'ID_LIST':[]}
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ if line == '' or line.startswith('#') or line.startswith('%'):
+ continue
+ if line.startswith(DATA_SYMBOL):
+ s = line[line.rfind(DATA_SYMBOL[-1])+1:]
+ s = s.strip()
+ if s in d:
+ print('make_target_tf: ID %s duplicate' % (s))
+ sys.exit()
+ d[s] = {'PROTEIN_ID':'', 'PROTEN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''}
+ d['ID_LIST'].append(s)
+ if line.startswith('DESCRIPTION:'):
+ d[s]['DESCRIPTION'] = get_value(line, ':')
+ elif line.startswith('PROTEN_NAME:'):
+ d[s]['PROTEN_NAME'] = get_value(line, ':')
+ elif line.startswith('PROTEIN_ID:'):
+ d[s]['PROTEIN_ID'] = get_value(line, ':')
+ elif line.startswith('DATA_NAME:'):
+ d[s]['DATA_NAME'] = get_value(line, ':')
+ elif line.startswith('DATA_FORMAT:'):
+ d[s]['DATA_FORMAT'] = get_value(line, ':')
+ elif line.startswith('LOCATION:'):
+ d[s]['LOCATION'] = get_value(line, ':')
+ elif line.startswith('NOTE:'):
+ d[s]['NOTE'] = get_value(line, ':')
+
+ return d
+
+
+def get_gene_list(fname):
+ result = []
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ lst = line.split()
+ result.append(lst[0])
+ f.close()
+ return result
+
+
+def get_threshold2(lst, glb_param_dict):
+ x = np.array(lst)
+ x = x[x > 0]
+ max_num = int(glb_param_dict['MAX_NUM_TARGETS']) # max number of targets for a protein
+    percent = float(glb_param_dict['OVERFLOW_TARGETS_PERCENTAGE']) # if there are more targets than the maximum, include this fraction of the excess targets
+ n = len(x)
+    if n <= max_num:  # '<=' also covers n == max_num, which would otherwise index past the end of x below
+ return x[-1]
+ else: # include some overflowing targets, but not all
+ overflow = n - max_num
+ keep = int(overflow * percent)
+ index = keep + max_num
+ return x[index]
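+    # Worked example (hypothetical numbers): with MAX_NUM_TARGETS = 10000,
+    # OVERFLOW_TARGETS_PERCENTAGE = 0.3 and n = 12000 positive values sorted
+    # in descending order, overflow = 2000, keep = 600 and index = 10600, so
+    # the threshold is the 10601st-largest value and roughly 10600 genes
+    # pass the 'bind_val >= tau' test in get_tf().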
+
+
+def convert_dict(d):
+ '''
+ d = {tf:{cond1:[target1, target2], cond2:[...]}}
+ result = {target:{tf:[c1,c2], tf:[c2,c3]}, ... }
+ '''
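+    # A small illustration (hypothetical IDs): if
+    #   d = {'TF_A': {'C001': ['G1', 'G2'], 'C002': ['G2']}}
+    # then the returned dictionary is
+    #   {'G1': {'TF_A': ['C001']}, 'G2': {'TF_A': ['C001', 'C002']}}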
+
+ result = {}
+ for k in d: # k is tf
+ vd = d[k] # vd is something like {cond1:[target1, target2], cond2:[...]}
+ for c in vd:
+ lst = vd[c] # a list of targets
+ for x in lst: # x is a target
+ if not x in result:
+ result[x] = {k:[c]}
+ else:
+ if not k in result[x]:
+ result[x][k] = [c]
+ else:
+ result[x][k].append(c)
+
+ return result
+
+
+def get_tf(bind_dict, info_dict, input_dict, glb_param_dict):
+
+ tf_dict = {} # key is TF, value is a list of targets:[target1, target2, target3, ...]
+ #input_cond = input_dict['colid']
+ if len(input_dict) > 0:
+        input_cond = input_dict['colid'][0] # use the first column of the INPUT matrix as input (could be improved). INPUT is only needed for BW-format data.
+ else:
+ input_cond = 'NA'
+
+ gene_id = np.array( bind_dict['rowid'] )
+
+ for c in bind_dict['colid']: # check each column (protein) in binding matrix. Find the protein's targets.
+ #print(c)
+ g2 = info_dict[c]['PROTEIN_ID'] # g2 is TF
+ bind_val = np.array( bind_dict['yy'][c] ) # a column of values
+
+        if info_dict[c]['DATA_FORMAT'].upper() == 'BW': # requires more consideration in the future
+ input_val = np.array( input_dict['yy'][input_cond] )
+ index = np.logical_and( np.logical_and(input_val > 0, input_val < 10000), (bind_val / input_val) > SIGNAL_INPUT_RATIO_TAU) # ignore intensities greater than 10000 as these are definitely noise
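+            # For instance (hypothetical values), with bind_val = [30, 5, 0]
+            # and input_val = [10, 10, 10], the ratios are [3.0, 0.5, 0.0],
+            # so only the first gene exceeds SIGNAL_INPUT_RATIO_TAU (= 1.5)
+            # and is kept as a target.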
+
+ elif info_dict[c]['DATA_FORMAT'].upper() == 'NARROWPEAK':
+ tau = get_threshold2(bind_dict['yy_sorted'][c], glb_param_dict)
+ index = bind_val >= tau
+ else:
+ print('make_target_tf: Data format %s not recognised. Only bw and narrowPeak are valid.' % (info_dict[c]['DATA_FORMAT']))
+ sys.exit()
+
+ target_gene_id = gene_id[index]
+ if g2 != '' and g2 != 'id_unknown':
+ if not g2 in tf_dict:
+ tf_dict[g2] = {c:list(target_gene_id)}
+ else:
+ tf_dict[g2][c] = list(target_gene_id)
+
+    # tf_dict is a bit complicated: each key is a TF and each value is a
+    # dictionary whose keys are conditions and whose values are lists of
+    # target genes.  It basically says that this TF, under condition c,
+    # binds to the listed target genes.
+ d = convert_dict(tf_dict)
+ return d
+
+
+def augment_dict(d, target, tf, cond_lst):
+    ''' Add (target, tf, cond_lst) to d, merging condition lists if the (target, tf) pair already exists. '''
+ if not target in d:
+ d[target] = {tf:cond_lst}
+ else:
+ if not tf in d[target]:
+ d[target][tf] = cond_lst
+ else:
+ cond_lst.extend(d[target][tf])
+ d[target][tf] = sorted(list(set(cond_lst)))
+
+
+def target_tf(bind_dict, bind_info_dict, input_dict, glb_param_dict):
+    ''' Print lines in the format 'target TF ChIP-seq-condition-list'; see ../Data/information/target_tf.txt.
+        For example, 'AT1G01270 AT3G46640 C0001000008426 C0001000008427 C0001000008428'.
+    '''
+ d = get_tf(bind_dict, bind_info_dict, input_dict, glb_param_dict)
+ # d has the following format {target:{tf1:[c1,c2], tf2:[c2,c3]}, ... }
+
+ # augment d with information from ../Data/information/target_tf_agris.txt and ../Data/information/target_tf.txt.20170629_143000
+ if os.path.exists('../Data/information/target_tf_agris.txt'):
+ f = open('../Data/information/target_tf_agris.txt')
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ lst = line.split('\t')
+ if len(lst) == 3:
+ target0 = lst[0]
+ tf0 = lst[1]
+ cond_lst0 = lst[2].split()
+ augment_dict(d, target0, tf0, cond_lst0)
+
+ if os.path.exists('../Data/information/target_tf.txt.20170629_143000'):
+ f = open('../Data/information/target_tf.txt.20170629_143000')
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ lst = line.split('\t')
+ if len(lst) == 3:
+ target0 = lst[0]
+ tf0 = lst[1]
+ cond_lst0 = lst[2].split()
+ augment_dict(d, target0, tf0, cond_lst0)
+
+ for target in sorted(d.keys()):
+ tf_d = d[target]
+ if len(tf_d) > 0:
+ for tf in sorted(tf_d.keys()):
+ cond_lst = sorted(list(set(tf_d[tf])))
+ #if len(cond_lst) > 1 and 'C0000000000001' in cond_lst: # C0000000000001 is for binding evidence from agris
+ # cond_lst.remove('C0000000000001')
+ print('%s\t%s\t%s' % (target, tf, ' '.join(cond_lst) ) )
+
+
+########## main ##################################################
+param_file = sys.argv[1] # a single parameter file, parameter_for_net.txt
+glb_param_dict = make_global_param_dict(param_file)
+#print('Read binding data ...')
+bind_dict = read_matrix_data(glb_param_dict['BINDING_MATRIX'])
+bind_info_dict = read_info_data(glb_param_dict['BINDING_INFO'])
+
+if os.path.exists(glb_param_dict['INPUT_MATRIX']):
+ input_dict = read_matrix_data(glb_param_dict['INPUT_MATRIX']) # for comparing with bw files
+else:
+ input_dict = {}
+
+#print('Make target TF lines ...')
+target_tf(bind_dict, bind_info_dict, input_dict, glb_param_dict)