Code/get_binding.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384

# Usage: python get_binding.py parameter_for_buildCmatrix.txt
#
#        Manually change CHR_INFO in parameter_for_buildCmatrix.txt if
#        your organism is not Arabidopsis.  Set REBUILD_LIST in
#        parameter_for_buildCmatrix.txt if you only want to make
#        binding column for a few ChIP-seq IDs (useful when adding new
#        ChIP-seq data but not wanting to re-generate existing binding
#        files).
#
#
# Purpose: make individual binding column files, one for each ChIP-seq
#          ID.  These files will be combined by buildCmatrix.py into a
#          big binding matrix, binding.txt.
#
# A typical column file looks like:
#
#   gene_id C0003000001450
#   AT1G01010       0
#   AT1G01020       0
#   AT1G01030       0
#   AT1G01040       0
#   AT1G01046       0
#   AT1G01050       0
#   AT1G01060       0
#   ...
#
# Last modified 5 APR 2017 slcu hui
# Last modified 4 AUG 2019 zjnu hui


import sys, os, operator, bisect
import numpy as np
import pyBigWig
from datetime import datetime


####################################
GLB_PARAM_SYMBOL    = '%%'
LCL_PARAM_SYMBOL    = '%'
DATA_SYMBOL         = '@'
####################################

def get_key_value(s):
    lst = s.split('=')
    k, v = lst[0], lst[1]
    return (k.strip(), v.strip())


def get_value(s, delimit):
    lst = s.split(delimit, 1)
    return lst[1].strip()


def make_chromosome_dict(fname):
    f = open(fname)
    lines = f.readlines()
    f.close()
    d = {}
    for line in lines:
        line = line.strip()
        if line != '' and not line.startswith('#'):
            lst = line.split('\t')
            k = lst[0]
            v = lst[1]
            d[k] = int(v)
    return d
            

def make_global_param_dict(fname):
    f = open(fname)
    d = {'GENE_FILE':'', 'TARGET_RANGE':'3000', 'FC':'2.0', 'PVALUE':'0.0001', 'QVALUE':'0.01', 'CHR_INFO':{}, 'DESTINATION':'', 'REBUILD_LIST':[] } # change
    for line in f:
        line = line.strip()
        if line.startswith(GLB_PARAM_SYMBOL):
            s = line[line.rfind(GLB_PARAM_SYMBOL[-1])+1:]
            lst = s.split('\t')  # separate items by TAB
            for x in lst:
                if x != '':
                    k, v = get_key_value(x)
                    if k == 'REBUILD_LIST' and v.lower() != 'all':
                        d[k] = v.split() # make a list and rewrite d[k]
                    elif k == 'CHR_INFO' and os.path.exists(v):
                        d[k] = make_chromosome_dict(v)
                    else:
                        d[k] = v
                        
    if len(d['CHR_INFO']) == 0:
        print('get_binding.py ERROR: must specify chromosome information CHR_INFO in paramter_for_buildCmatrix.txt.')
        sys.exit()
        
    f.close()
    return d


# for each dataset, make a column gene_id, sample_id
def make_chip_data(fname):
    ''' fname - a narrowPeak file
    given a file, return a dictionary. key is chromosome number. value is a list (start_pos, end_pos, strength). 
    '''

    d = {}
    if not os.path.exists(fname):
        print('get_binding: cannot find file %s' % (fname))
        sys.exit()

    f = open(fname)
    lines = f.readlines()
    f.close()
    if not check_valid_peak_line(lines[0]): # very basic check
        print('get_binding: %s is not a valid BED file as in the first line the chromosome is not a number or starts with chr. ignored.' % (fname))
        return d
    for line in lines:
        line = line.strip()
        lst = line.split()
        c = lst[0]

        rindex = c.lower().rfind('r') # handle chr3 case, get 3 only. If it is Pt or Mt, use it.
        if  rindex != -1:
            c = c[rindex+1:].strip()
            
        ss = int(lst[1])
        ee = int(lst[2])
        strength = lst[4] # the 5th column is used as strength
        if not c in d:
            d[c] = [(ss, ee, strength)]
        else:
            d[c].append((ss, ee, strength))

    for k in d:
        d[k] = sorted(d[k], key=operator.itemgetter(0, 1)) # sort by start position, then by end position
    return d


def make_data_dict(fname):
    ''' fname is paramter_for_buildCmatrix.txt '''

    d = {'ID_LIST':[]} # keep a list of chip id's, such as C0001100007100
    f = open(fname)
    lines = f.readlines()
    f.close()
    for line in lines:
        line = line.strip()
        if line == '' or line.startswith('#'):
            continue
        if line.startswith(DATA_SYMBOL):
            s = line[line.rfind(DATA_SYMBOL[-1])+1:]
            s = s.strip()
            if s in d:
                print('get_binding: ID %s is duplicate. Check paramter_for_buildCmatrix.txt.' % (s))
                sys.exit()
            d[s] = {'PROTEIN_ID':'', 'PROTEN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''}
        if line.startswith('DESCRIPTION:'):
            d[s]['DESCRIPTION'] = get_value(line, ':')
        elif line.startswith('PROTEN_NAME:'):
            d[s]['PROTEN_NAME'] = get_value(line, ':')
        elif line.startswith('PROTEIN_ID:'):
            d[s]['PROTEIN_ID'] = get_value(line, ':')
        elif line.startswith('DATA_NAME:'):
            d[s]['DATA_NAME'] = get_value(line, ':')                        
        elif line.startswith('DATA_FORMAT:'):
            d[s]['DATA_FORMAT'] = get_value(line, ':')
        elif line.startswith('LOCATION:'):
            d[s]['LOCATION'] = get_value(line, ':')
            if os.path.exists(d[s]['LOCATION']):
                d['ID_LIST'].append(s)
            else:
                print('get_binding: I could not find file\n%s\nfor ChIP-seq ID %s.  Ignore this ID.' % (d[s]['LOCATION'], s))
        elif line.startswith('NOTE:'):
            d[s]['NOTE'] = get_value(line, ':')
        elif line.startswith(LCL_PARAM_SYMBOL) and not line.startswith(GLB_PARAM_SYMBOL):
            make_local_parameter(d[s]['PARAM'], line)

    return d


def get_interval(c, s, e, o, flank, chr_info_dict):
    '''
    o --- +  positive strand, don't include gene body
          ++ positive strand, include gene body
          -  negative strand, don't include gene body
          -- negative strand, include gene body
          *  include both sides and gene body
    '''

    if o == '+':
        return (np.max([0, s-flank]), s)
    elif o == '++': # also include gene body
        return (np.max([0, s-flank]), e)
    elif o == '-':
        return (e, np.min([e+flank, chr_info_dict[c]]))
    elif o == '--':
        return (s, np.min([e+flank, chr_info_dict[c]]))
    elif o == '*': # both sides and gene body
        return ( np.max([0, s-flank]), np.min([e+flank, chr_info_dict[c]]) )
    return (-1, -1) # should not be here


def check_valid_peak_line(s):
    s = s.strip()
    lst = s.split()
    if lst[0].isdigit() or lst[0].lower().startswith('chr'):
        return True
    return False


def get_interval_intersection(chromosome, start_pos, end_pos, chip_dict):
    ''' check with a given interval has intersection with peaks from a ChIP-seq '''
    if len(chip_dict) == 0 or not chromosome in chip_dict:
        return []

    result = []
    lst = chip_dict[chromosome] # get a list of intervals in that chromosome
    n = len(lst)
    slst, elst, strength_lst = zip(*lst) # make three sub-lists
    index1 = max(0, bisect.bisect(elst, start_pos)-2) # get start position
    index2 = min(bisect.bisect(slst, end_pos)+2, n-1) # get end position
    sublst = lst[index1:index2] # these intervals potentially have intersections with given interval
    #print('\t\t\tDEBUG sublst length: %d (index1 %d, index2 %d)' % (len(sublst), index1, index2))
    for t in sublst:
        ss = t[0]
        ee = t[1]
        strength = t[2]
        if start_pos <= ee and end_pos >= ss:
            result.append(int(strength))
            #print('chromosome=%s start_pos=%d end_pos=%d c=%s ss=%d ee=%d' % (chromosome, start_pos, end_pos, c, ss, ee))
    return result


def make_o(c, glb_param_dict):
    ''' make orientation '''
    ig = glb_param_dict['INCLUDE_GENE_BODY'].upper() == 'YES' # include gene body
    both_side = glb_param_dict['BOTH_SIDE'].upper() == 'YES'
    if both_side:
        return '*'
    if ig:
        return 2*c # also include gene body
    return c
    

def calc_intensity(t, bw, norm_by_gene_length=0):
    '''
    t - a tuple, chromosome, start and end position
    '''
    chromosome = t[0]
    start = t[1]
    end = t[2]
    if end < start:
        print('get_binding.py calc_intensity: start position must be less than end position')
        sys.exit()

    I = bw.intervals(chromosome, start, end) # a list of intervals
    
    sum_area = 0.0
    if I: # is I is not empty
        for t in I:
            s0 = max(start, t[0])
            e0 = min(end, t[1])
            h = t[2]
            #print('=== %d %d %g' % (s0, e0, h))
            sum_area += h * (e0 - s0)

    if norm_by_gene_length != 0:
        return 1000.0 * sum_area / (end - start)
    else:
        return 1.0 * sum_area

    
def get_update_date(s):
    index = s.find('update:')
    if index < 0:
        return None
    result = s[s.rfind('update:')+7:].strip()
    if result.isdigit() and len(result) == 6:
        return result
    else:
        return '00000000'


def make_table(gene_file, data_dict, glb_param_dict):
    ''' 
    Each line in gene_file contains TAB-separated fields: gene_id, gene_name, chr, start, end, strand, description (optional).
    '''

    if glb_param_dict['REBUILD_LIST'] == []: # when not specified, use all
        id_lst = data_dict['ID_LIST']
    else:
        id_lst = glb_param_dict['REBUILD_LIST']
        for c in id_lst:
            if not c in data_dict['ID_LIST']:
                print('get_binding: %s has no corresponding ChIP-seq data (narrowPeak or bw)' % (c))
                sys.exit()
    
    chip_data_dict = {}
    for myid in id_lst:
        chip_file = data_dict[myid]['LOCATION']

        if not os.path.exists(chip_file):
            print('get_binding: file %s dose not exists.' % (chip_file))
            sys.exit()
        
        if data_dict[myid]['DATA_FORMAT'].upper() == 'NARROWPEAK':
            chip_data = make_chip_data(chip_file)
            
        elif data_dict[myid]['DATA_FORMAT'].upper() == 'BW':
            chip_data = pyBigWig.open(chip_file)

        else:
            print('get_binding: data format %s not supported!' % (data_dict[myid]['DATA_FORMAT']))
            sys.exit()
            
        chip_data_dict[myid] = chip_data    
    

    f = open(gene_file)
    lines = f.readlines() # lines contain all lines in gene_file.txt
    f.close()

    dest = glb_param_dict['DESTINATION'] # individual binding column files will be put here
    if not os.path.isdir(dest):
        os.makedirs(dest)
    
    for myid in id_lst:  # for each ChIP-seq ID, make a file in DESTINATION with file name such as C0001100007100.txt
        #print('Processing %s' % (myid))
        fname = os.path.join(dest, myid + '.txt')
        if os.path.exists(fname): # this file has been created before.  Remake the binding file if and only if the update date in the NOTE: field is greater than the file's modification date.
            file_modification_time = datetime.fromtimestamp(os.path.getmtime(fname)).strftime('%Y%m%d')
            file_update_time = get_update_date(data_dict[myid]['NOTE'])
            if file_update_time == None:
                continue
            if file_update_time <= file_modification_time:
                continue
            
        f = open(fname, 'w')
        content = 'gene_id\t%s\n' % (myid)

        for line in lines:
            line = line.strip()
            lst = line.split('\t')
            gene_id   = lst[0]
            gene_name = lst[1]
            c         = lst[2] # chromosome, a letter, e.g., '1', '2', etc.
            s         = int(lst[3])
            e         = int(lst[4])
            o         = make_o(lst[5], glb_param_dict) # gene oritentation, + or -

            flank = int(glb_param_dict['TARGET_RANGE'])
            interval = get_interval(c, s, e, o, flank, glb_param_dict['CHR_INFO']) # only consider upstream

            s0 = gene_id

            if data_dict[myid]['DATA_FORMAT'].upper() == 'NARROWPEAK':
                intersection_list = get_interval_intersection(c, interval[0], interval[1], chip_data_dict[myid])
                if intersection_list != []:
                    s0 += '\t'  + str(np.max(intersection_list)) # get the maximum value in the interval [change]
                else:
                    s0 += '\t' + '0'

            if data_dict[myid]['DATA_FORMAT'].upper() == 'BW':
                t = (c, interval[0], interval[1])
                try:
                    z = calc_intensity(t, chip_data_dict[myid], 1)
                except RuntimeError:
                    z = -1.0
                s0 += '\t'  + '%4.2f' % (z)

            content += s0 + '\n'
        
        f.write(content)
        f.close()
        
    # close bw files
    for myid in id_lst:
        if data_dict[myid]['DATA_FORMAT'].upper() == 'BW':
            chip_data_dict[myid].close()


## main
param_file = sys.argv[1] # should be parameter_for_buildCmatrix.txt
global_param_dict = make_global_param_dict(param_file)
data_dict = make_data_dict(param_file)
print('get_binding: I will produce binding column files for the following %d ChIP-seq IDs:\n%s.'
      % (len(data_dict['ID_LIST']), ','.join(data_dict['ID_LIST'])))
make_table(global_param_dict['GENE_FILE'], data_dict, global_param_dict)