1 files changed, 384 insertions, 0 deletions
diff --git a/Code/get_binding.py b/Code/get_binding.py
new file mode 100644
index 0000000..e4087ca
--- /dev/null
+++ b/Code/get_binding.py
@@ -0,0 +1,384 @@
+# Usage: python get_binding.py parameter_for_buildCmatrix.txt
+#
+#        Manually change CHR_INFO in parameter_for_buildCmatrix.txt if
+#        your organism is not Arabidopsis.  Set REBUILD_LIST in
+#        parameter_for_buildCmatrix.txt if you only want to make
+#        binding column for a few ChIP-seq IDs (useful when adding new
+#        ChIP-seq data but not wanting to re-generate existing binding
+#        files).
+#
+#
+# Purpose: make individual binding column files, one for each ChIP-seq
+#          ID.  These files will be combined by buildCmatrix.py into a
+#          big binding matrix, binding.txt.
+#
+# A typical column file looks like:
+#
+#   gene_id C0003000001450
+#   AT1G01010       0
+#   AT1G01020       0
+#   AT1G01030       0
+#   AT1G01040       0
+#   AT1G01046       0
+#   AT1G01050       0
+#   AT1G01060       0
+#   ...
+#
+# Last modified 5 APR 2017 slcu hui
+# Last modified 4 AUG 2019 zjnu hui
+
+
+import sys, os, operator, bisect
+import numpy as np
+import pyBigWig
+from datetime import datetime
+
+
+####################################
+GLB_PARAM_SYMBOL    = '%%'
+LCL_PARAM_SYMBOL    = '%'
+DATA_SYMBOL         = '@'
+####################################
+
+def get_key_value(s):
+    lst = s.split('=')
+    k, v = lst[0], lst[1]
+    return (k.strip(), v.strip())
+
+
+def get_value(s, delimit):
+    lst = s.split(delimit, 1)
+    return lst[1].strip()
+
+
+def make_chromosome_dict(fname):
+    f = open(fname)
+    lines = f.readlines()
+    f.close()
+    d = {}
+    for line in lines:
+        line = line.strip()
+        if line != '' and not line.startswith('#'):
+            lst = line.split('\t')
+            k = lst[0]
+            v = lst[1]
+            d[k] = int(v)
+    return d
+            
+
+def make_global_param_dict(fname):
+    f = open(fname)
+    d = {'GENE_FILE':'', 'TARGET_RANGE':'3000', 'FC':'2.0', 'PVALUE':'0.0001', 'QVALUE':'0.01', 'CHR_INFO':{}, 'DESTINATION':'', 'REBUILD_LIST':[] } # change
+    for line in f:
+        line = line.strip()
+        if line.startswith(GLB_PARAM_SYMBOL):
+            s = line[line.rfind(GLB_PARAM_SYMBOL[-1])+1:]
+            lst = s.split('\t')  # separate items by TAB
+            for x in lst:
+                if x != '':
+                    k, v = get_key_value(x)
+                    if k == 'REBUILD_LIST' and v.lower() != 'all':
+                        d[k] = v.split() # make a list and rewrite d[k]
+                    elif k == 'CHR_INFO' and os.path.exists(v):
+                        d[k] = make_chromosome_dict(v)
+                    else:
+                        d[k] = v
+                        
+    if len(d['CHR_INFO']) == 0:
+        print('get_binding.py ERROR: must specify chromosome information CHR_INFO in paramter_for_buildCmatrix.txt.')
+        sys.exit()
+        
+    f.close()
+    return d
+
+
+# for each dataset, make a column gene_id, sample_id
+def make_chip_data(fname):
+    ''' fname - a narrowPeak file
+    given a file, return a dictionary. key is chromosome number. value is a list (start_pos, end_pos, strength). 
+    '''
+
+    d = {}
+    if not os.path.exists(fname):
+        print('get_binding: cannot find file %s' % (fname))
+        sys.exit()
+
+    f = open(fname)
+    lines = f.readlines()
+    f.close()
+    if not check_valid_peak_line(lines[0]): # very basic check
+        print('get_binding: %s is not a valid BED file as in the first line the chromosome is not a number or starts with chr. ignored.' % (fname))
+        return d
+    for line in lines:
+        line = line.strip()
+        lst = line.split()
+        c = lst[0]
+
+        rindex = c.lower().rfind('r') # handle chr3 case, get 3 only. If it is Pt or Mt, use it.
+        if  rindex != -1:
+            c = c[rindex+1:].strip()
+            
+        ss = int(lst[1])
+        ee = int(lst[2])
+        strength = lst[4] # the 5th column is used as strength
+        if not c in d:
+            d[c] = [(ss, ee, strength)]
+        else:
+            d[c].append((ss, ee, strength))
+
+    for k in d:
+        d[k] = sorted(d[k], key=operator.itemgetter(0, 1)) # sort by start position, then by end position
+    return d
+
+
+def make_data_dict(fname):
+    ''' fname is paramter_for_buildCmatrix.txt '''
+
+    d = {'ID_LIST':[]} # keep a list of chip id's, such as C0001100007100
+    f = open(fname)
+    lines = f.readlines()
+    f.close()
+    for line in lines:
+        line = line.strip()
+        if line == '' or line.startswith('#'):
+            continue
+        if line.startswith(DATA_SYMBOL):
+            s = line[line.rfind(DATA_SYMBOL[-1])+1:]
+            s = s.strip()
+            if s in d:
+                print('get_binding: ID %s is duplicate. Check paramter_for_buildCmatrix.txt.' % (s))
+                sys.exit()
+            d[s] = {'PROTEIN_ID':'', 'PROTEN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''}
+        if line.startswith('DESCRIPTION:'):
+            d[s]['DESCRIPTION'] = get_value(line, ':')
+        elif line.startswith('PROTEN_NAME:'):
+            d[s]['PROTEN_NAME'] = get_value(line, ':')
+        elif line.startswith('PROTEIN_ID:'):
+            d[s]['PROTEIN_ID'] = get_value(line, ':')
+        elif line.startswith('DATA_NAME:'):
+            d[s]['DATA_NAME'] = get_value(line, ':')                        
+        elif line.startswith('DATA_FORMAT:'):
+            d[s]['DATA_FORMAT'] = get_value(line, ':')
+        elif line.startswith('LOCATION:'):
+            d[s]['LOCATION'] = get_value(line, ':')
+            if os.path.exists(d[s]['LOCATION']):
+                d['ID_LIST'].append(s)
+            else:
+                print('get_binding: I could not find file\n%s\nfor ChIP-seq ID %s.  Ignore this ID.' % (d[s]['LOCATION'], s))
+        elif line.startswith('NOTE:'):
+            d[s]['NOTE'] = get_value(line, ':')
+        elif line.startswith(LCL_PARAM_SYMBOL) and not line.startswith(GLB_PARAM_SYMBOL):
+            make_local_parameter(d[s]['PARAM'], line)
+
+    return d
+
+
+def get_interval(c, s, e, o, flank, chr_info_dict):
+    '''
+    o --- +  positive strand, don't include gene body
+          ++ positive strand, include gene body
+          -  negative strand, don't include gene body
+          -- negative strand, include gene body
+          *  include both sides and gene body
+    '''
+
+    if o == '+':
+        return (np.max([0, s-flank]), s)
+    elif o == '++': # also include gene body
+        return (np.max([0, s-flank]), e)
+    elif o == '-':
+        return (e, np.min([e+flank, chr_info_dict[c]]))
+    elif o == '--':
+        return (s, np.min([e+flank, chr_info_dict[c]]))
+    elif o == '*': # both sides and gene body
+        return ( np.max([0, s-flank]), np.min([e+flank, chr_info_dict[c]]) )
+    return (-1, -1) # should not be here
+
+
+def check_valid_peak_line(s):
+    s = s.strip()
+    lst = s.split()
+    if lst[0].isdigit() or lst[0].lower().startswith('chr'):
+        return True
+    return False
+
+
+def get_interval_intersection(chromosome, start_pos, end_pos, chip_dict):
+    ''' check with a given interval has intersection with peaks from a ChIP-seq '''
+    if len(chip_dict) == 0 or not chromosome in chip_dict:
+        return []
+
+    result = []
+    lst = chip_dict[chromosome] # get a list of intervals in that chromosome
+    n = len(lst)
+    slst, elst, strength_lst = zip(*lst) # make three sub-lists
+    index1 = max(0, bisect.bisect(elst, start_pos)-2) # get start position
+    index2 = min(bisect.bisect(slst, end_pos)+2, n-1) # get end position
+    sublst = lst[index1:index2] # these intervals potentially have intersections with given interval
+    #print('\t\t\tDEBUG sublst length: %d (index1 %d, index2 %d)' % (len(sublst), index1, index2))
+    for t in sublst:
+        ss = t[0]
+        ee = t[1]
+        strength = t[2]
+        if start_pos <= ee and end_pos >= ss:
+            result.append(int(strength))
+            #print('chromosome=%s start_pos=%d end_pos=%d c=%s ss=%d ee=%d' % (chromosome, start_pos, end_pos, c, ss, ee))
+    return result
+
+
+def make_o(c, glb_param_dict):
+    ''' make orientation '''
+    ig = glb_param_dict['INCLUDE_GENE_BODY'].upper() == 'YES' # include gene body
+    both_side = glb_param_dict['BOTH_SIDE'].upper() == 'YES'
+    if both_side:
+        return '*'
+    if ig:
+        return 2*c # also include gene body
+    return c
+    
+
+def calc_intensity(t, bw, norm_by_gene_length=0):
+    '''
+    t - a tuple, chromosome, start and end position
+    '''
+    chromosome = t[0]
+    start = t[1]
+    end = t[2]
+    if end < start:
+        print('get_binding.py calc_intensity: start position must be less than end position')
+        sys.exit()
+
+    I = bw.intervals(chromosome, start, end) # a list of intervals
+    
+    sum_area = 0.0
+    if I: # is I is not empty
+        for t in I:
+            s0 = max(start, t[0])
+            e0 = min(end, t[1])
+            h = t[2]
+            #print('=== %d %d %g' % (s0, e0, h))
+            sum_area += h * (e0 - s0)
+
+    if norm_by_gene_length != 0:
+        return 1000.0 * sum_area / (end - start)
+    else:
+        return 1.0 * sum_area
+
+    
+def get_update_date(s):
+    index = s.find('update:')
+    if index < 0:
+        return None
+    result = s[s.rfind('update:')+7:].strip()
+    if result.isdigit() and len(result) == 6:
+        return result
+    else:
+        return '00000000'
+
+
+def make_table(gene_file, data_dict, glb_param_dict):
+    ''' 
+    Each line in gene_file contains TAB-separated fields: gene_id, gene_name, chr, start, end, strand, description (optional).
+    '''
+
+    if glb_param_dict['REBUILD_LIST'] == []: # when not specified, use all
+        id_lst = data_dict['ID_LIST']
+    else:
+        id_lst = glb_param_dict['REBUILD_LIST']
+        for c in id_lst:
+            if not c in data_dict['ID_LIST']:
+                print('get_binding: %s has no corresponding ChIP-seq data (narrowPeak or bw)' % (c))
+                sys.exit()
+    
+    chip_data_dict = {}
+    for myid in id_lst:
+        chip_file = data_dict[myid]['LOCATION']
+
+        if not os.path.exists(chip_file):
+            print('get_binding: file %s dose not exists.' % (chip_file))
+            sys.exit()
+        
+        if data_dict[myid]['DATA_FORMAT'].upper() == 'NARROWPEAK':
+            chip_data = make_chip_data(chip_file)
+            
+        elif data_dict[myid]['DATA_FORMAT'].upper() == 'BW':
+            chip_data = pyBigWig.open(chip_file)
+
+        else:
+            print('get_binding: data format %s not supported!' % (data_dict[myid]['DATA_FORMAT']))
+            sys.exit()
+            
+        chip_data_dict[myid] = chip_data    
+    
+
+    f = open(gene_file)
+    lines = f.readlines() # lines contain all lines in gene_file.txt
+    f.close()
+
+    dest = glb_param_dict['DESTINATION'] # individual binding column files will be put here
+    if not os.path.isdir(dest):
+        os.makedirs(dest)
+    
+    for myid in id_lst:  # for each ChIP-seq ID, make a file in DESTINATION with file name such as C0001100007100.txt
+        #print('Processing %s' % (myid))
+        fname = os.path.join(dest, myid + '.txt')
+        if os.path.exists(fname): # this file has been created before.  Remake the binding file if and only if the update date in the NOTE: field is greater than the file's modification date.
+            file_modification_time = datetime.fromtimestamp(os.path.getmtime(fname)).strftime('%Y%m%d')
+            file_update_time = get_update_date(data_dict[myid]['NOTE'])
+            if file_update_time == None:
+                continue
+            if file_update_time <= file_modification_time:
+                continue
+            
+        f = open(fname, 'w')
+        content = 'gene_id\t%s\n' % (myid)
+
+        for line in lines:
+            line = line.strip()
+            lst = line.split('\t')
+            gene_id   = lst[0]
+            gene_name = lst[1]
+            c         = lst[2] # chromosome, a letter, e.g., '1', '2', etc.
+            s         = int(lst[3])
+            e         = int(lst[4])
+            o         = make_o(lst[5], glb_param_dict) # gene oritentation, + or -
+
+            flank = int(glb_param_dict['TARGET_RANGE'])
+            interval = get_interval(c, s, e, o, flank, glb_param_dict['CHR_INFO']) # only consider upstream
+
+            s0 = gene_id
+
+            if data_dict[myid]['DATA_FORMAT'].upper() == 'NARROWPEAK':
+                intersection_list = get_interval_intersection(c, interval[0], interval[1], chip_data_dict[myid])
+                if intersection_list != []:
+                    s0 += '\t'  + str(np.max(intersection_list)) # get the maximum value in the interval [change]
+                else:
+                    s0 += '\t' + '0'
+
+            if data_dict[myid]['DATA_FORMAT'].upper() == 'BW':
+                t = (c, interval[0], interval[1])
+                try:
+                    z = calc_intensity(t, chip_data_dict[myid], 1)
+                except RuntimeError:
+                    z = -1.0
+                s0 += '\t'  + '%4.2f' % (z)
+
+            content += s0 + '\n'
+        
+        f.write(content)
+        f.close()
+        
+    # close bw files
+    for myid in id_lst:
+        if data_dict[myid]['DATA_FORMAT'].upper() == 'BW':
+            chip_data_dict[myid].close()
+
+
+
+## main
+param_file = sys.argv[1] # should be parameter_for_buildCmatrix.txt
+global_param_dict = make_global_param_dict(param_file)
+data_dict = make_data_dict(param_file)
+print('get_binding: I will produce binding column files for the following %d ChIP-seq IDs:\n%s.'
+      % (len(data_dict['ID_LIST']), ','.join(data_dict['ID_LIST'])))
+make_table(global_param_dict['GENE_FILE'], data_dict, global_param_dict)