diff options
Diffstat (limited to 'Code/get_binding.py')
-rw-r--r-- | Code/get_binding.py | 384 |
1 files changed, 384 insertions, 0 deletions
diff --git a/Code/get_binding.py b/Code/get_binding.py new file mode 100644 index 0000000..e4087ca --- /dev/null +++ b/Code/get_binding.py @@ -0,0 +1,384 @@ +# Usage: python get_binding.py parameter_for_buildCmatrix.txt +# +# Manually change CHR_INFO in parameter_for_buildCmatrix.txt if +# your organism is not Arabidopsis. Set REBUILD_LIST in +# parameter_for_buildCmatrix.txt if you only want to make +# binding column for a few ChIP-seq IDs (useful when adding new +# ChIP-seq data but not wanting to re-generate existing binding +# files). +# +# +# Purpose: make individual binding column files, one for each ChIP-seq +# ID. These files will be combined by buildCmatrix.py into a +# big binding matrix, binding.txt. +# +# A typical column file looks like: +# +# gene_id C0003000001450 +# AT1G01010 0 +# AT1G01020 0 +# AT1G01030 0 +# AT1G01040 0 +# AT1G01046 0 +# AT1G01050 0 +# AT1G01060 0 +# ... +# +# Last modified 5 APR 2017 slcu hui +# Last modified 4 AUG 2019 zjnu hui + + +import sys, os, operator, bisect +import numpy as np +import pyBigWig +from datetime import datetime + + +#################################### +GLB_PARAM_SYMBOL = '%%' +LCL_PARAM_SYMBOL = '%' +DATA_SYMBOL = '@' +#################################### + +def get_key_value(s): + lst = s.split('=') + k, v = lst[0], lst[1] + return (k.strip(), v.strip()) + + +def get_value(s, delimit): + lst = s.split(delimit, 1) + return lst[1].strip() + + +def make_chromosome_dict(fname): + f = open(fname) + lines = f.readlines() + f.close() + d = {} + for line in lines: + line = line.strip() + if line != '' and not line.startswith('#'): + lst = line.split('\t') + k = lst[0] + v = lst[1] + d[k] = int(v) + return d + + +def make_global_param_dict(fname): + f = open(fname) + d = {'GENE_FILE':'', 'TARGET_RANGE':'3000', 'FC':'2.0', 'PVALUE':'0.0001', 'QVALUE':'0.01', 'CHR_INFO':{}, 'DESTINATION':'', 'REBUILD_LIST':[] } # change + for line in f: + line = line.strip() + if line.startswith(GLB_PARAM_SYMBOL): + s = line[line.rfind(GLB_PARAM_SYMBOL[-1])+1:] + lst = s.split('\t') # separate items by TAB + for x in lst: + if x != '': + k, v = get_key_value(x) + if k == 'REBUILD_LIST' and v.lower() != 'all': + d[k] = v.split() # make a list and rewrite d[k] + elif k == 'CHR_INFO' and os.path.exists(v): + d[k] = make_chromosome_dict(v) + else: + d[k] = v + + if len(d['CHR_INFO']) == 0: + print('get_binding.py ERROR: must specify chromosome information CHR_INFO in paramter_for_buildCmatrix.txt.') + sys.exit() + + f.close() + return d + + +# for each dataset, make a column gene_id, sample_id +def make_chip_data(fname): + ''' fname - a narrowPeak file + given a file, return a dictionary. key is chromosome number. value is a list (start_pos, end_pos, strength). + ''' + + d = {} + if not os.path.exists(fname): + print('get_binding: cannot find file %s' % (fname)) + sys.exit() + + f = open(fname) + lines = f.readlines() + f.close() + if not check_valid_peak_line(lines[0]): # very basic check + print('get_binding: %s is not a valid BED file as in the first line the chromosome is not a number or starts with chr. ignored.' % (fname)) + return d + for line in lines: + line = line.strip() + lst = line.split() + c = lst[0] + + rindex = c.lower().rfind('r') # handle chr3 case, get 3 only. If it is Pt or Mt, use it. + if rindex != -1: + c = c[rindex+1:].strip() + + ss = int(lst[1]) + ee = int(lst[2]) + strength = lst[4] # the 5th column is used as strength + if not c in d: + d[c] = [(ss, ee, strength)] + else: + d[c].append((ss, ee, strength)) + + for k in d: + d[k] = sorted(d[k], key=operator.itemgetter(0, 1)) # sort by start position, then by end position + return d + + +def make_data_dict(fname): + ''' fname is paramter_for_buildCmatrix.txt ''' + + d = {'ID_LIST':[]} # keep a list of chip id's, such as C0001100007100 + f = open(fname) + lines = f.readlines() + f.close() + for line in lines: + line = line.strip() + if line == '' or line.startswith('#'): + continue + if line.startswith(DATA_SYMBOL): + s = line[line.rfind(DATA_SYMBOL[-1])+1:] + s = s.strip() + if s in d: + print('get_binding: ID %s is duplicate. Check paramter_for_buildCmatrix.txt.' % (s)) + sys.exit() + d[s] = {'PROTEIN_ID':'', 'PROTEN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''} + if line.startswith('DESCRIPTION:'): + d[s]['DESCRIPTION'] = get_value(line, ':') + elif line.startswith('PROTEN_NAME:'): + d[s]['PROTEN_NAME'] = get_value(line, ':') + elif line.startswith('PROTEIN_ID:'): + d[s]['PROTEIN_ID'] = get_value(line, ':') + elif line.startswith('DATA_NAME:'): + d[s]['DATA_NAME'] = get_value(line, ':') + elif line.startswith('DATA_FORMAT:'): + d[s]['DATA_FORMAT'] = get_value(line, ':') + elif line.startswith('LOCATION:'): + d[s]['LOCATION'] = get_value(line, ':') + if os.path.exists(d[s]['LOCATION']): + d['ID_LIST'].append(s) + else: + print('get_binding: I could not find file\n%s\nfor ChIP-seq ID %s. Ignore this ID.' % (d[s]['LOCATION'], s)) + elif line.startswith('NOTE:'): + d[s]['NOTE'] = get_value(line, ':') + elif line.startswith(LCL_PARAM_SYMBOL) and not line.startswith(GLB_PARAM_SYMBOL): + make_local_parameter(d[s]['PARAM'], line) + + return d + + +def get_interval(c, s, e, o, flank, chr_info_dict): + ''' + o --- + positive strand, don't include gene body + ++ positive strand, include gene body + - negative strand, don't include gene body + -- negative strand, include gene body + * include both sides and gene body + ''' + + if o == '+': + return (np.max([0, s-flank]), s) + elif o == '++': # also include gene body + return (np.max([0, s-flank]), e) + elif o == '-': + return (e, np.min([e+flank, chr_info_dict[c]])) + elif o == '--': + return (s, np.min([e+flank, chr_info_dict[c]])) + elif o == '*': # both sides and gene body + return ( np.max([0, s-flank]), np.min([e+flank, chr_info_dict[c]]) ) + return (-1, -1) # should not be here + + +def check_valid_peak_line(s): + s = s.strip() + lst = s.split() + if lst[0].isdigit() or lst[0].lower().startswith('chr'): + return True + return False + + +def get_interval_intersection(chromosome, start_pos, end_pos, chip_dict): + ''' check with a given interval has intersection with peaks from a ChIP-seq ''' + if len(chip_dict) == 0 or not chromosome in chip_dict: + return [] + + result = [] + lst = chip_dict[chromosome] # get a list of intervals in that chromosome + n = len(lst) + slst, elst, strength_lst = zip(*lst) # make three sub-lists + index1 = max(0, bisect.bisect(elst, start_pos)-2) # get start position + index2 = min(bisect.bisect(slst, end_pos)+2, n-1) # get end position + sublst = lst[index1:index2] # these intervals potentially have intersections with given interval + #print('\t\t\tDEBUG sublst length: %d (index1 %d, index2 %d)' % (len(sublst), index1, index2)) + for t in sublst: + ss = t[0] + ee = t[1] + strength = t[2] + if start_pos <= ee and end_pos >= ss: + result.append(int(strength)) + #print('chromosome=%s start_pos=%d end_pos=%d c=%s ss=%d ee=%d' % (chromosome, start_pos, end_pos, c, ss, ee)) + return result + + +def make_o(c, glb_param_dict): + ''' make orientation ''' + ig = glb_param_dict['INCLUDE_GENE_BODY'].upper() == 'YES' # include gene body + both_side = glb_param_dict['BOTH_SIDE'].upper() == 'YES' + if both_side: + return '*' + if ig: + return 2*c # also include gene body + return c + + +def calc_intensity(t, bw, norm_by_gene_length=0): + ''' + t - a tuple, chromosome, start and end position + ''' + chromosome = t[0] + start = t[1] + end = t[2] + if end < start: + print('get_binding.py calc_intensity: start position must be less than end position') + sys.exit() + + I = bw.intervals(chromosome, start, end) # a list of intervals + + sum_area = 0.0 + if I: # is I is not empty + for t in I: + s0 = max(start, t[0]) + e0 = min(end, t[1]) + h = t[2] + #print('=== %d %d %g' % (s0, e0, h)) + sum_area += h * (e0 - s0) + + if norm_by_gene_length != 0: + return 1000.0 * sum_area / (end - start) + else: + return 1.0 * sum_area + + +def get_update_date(s): + index = s.find('update:') + if index < 0: + return None + result = s[s.rfind('update:')+7:].strip() + if result.isdigit() and len(result) == 6: + return result + else: + return '00000000' + + +def make_table(gene_file, data_dict, glb_param_dict): + ''' + Each line in gene_file contains TAB-separated fields: gene_id, gene_name, chr, start, end, strand, description (optional). + ''' + + if glb_param_dict['REBUILD_LIST'] == []: # when not specified, use all + id_lst = data_dict['ID_LIST'] + else: + id_lst = glb_param_dict['REBUILD_LIST'] + for c in id_lst: + if not c in data_dict['ID_LIST']: + print('get_binding: %s has no corresponding ChIP-seq data (narrowPeak or bw)' % (c)) + sys.exit() + + chip_data_dict = {} + for myid in id_lst: + chip_file = data_dict[myid]['LOCATION'] + + if not os.path.exists(chip_file): + print('get_binding: file %s dose not exists.' % (chip_file)) + sys.exit() + + if data_dict[myid]['DATA_FORMAT'].upper() == 'NARROWPEAK': + chip_data = make_chip_data(chip_file) + + elif data_dict[myid]['DATA_FORMAT'].upper() == 'BW': + chip_data = pyBigWig.open(chip_file) + + else: + print('get_binding: data format %s not supported!' % (data_dict[myid]['DATA_FORMAT'])) + sys.exit() + + chip_data_dict[myid] = chip_data + + + f = open(gene_file) + lines = f.readlines() # lines contain all lines in gene_file.txt + f.close() + + dest = glb_param_dict['DESTINATION'] # individual binding column files will be put here + if not os.path.isdir(dest): + os.makedirs(dest) + + for myid in id_lst: # for each ChIP-seq ID, make a file in DESTINATION with file name such as C0001100007100.txt + #print('Processing %s' % (myid)) + fname = os.path.join(dest, myid + '.txt') + if os.path.exists(fname): # this file has been created before. Remake the binding file if and only if the update date in the NOTE: field is greater than the file's modification date. + file_modification_time = datetime.fromtimestamp(os.path.getmtime(fname)).strftime('%Y%m%d') + file_update_time = get_update_date(data_dict[myid]['NOTE']) + if file_update_time == None: + continue + if file_update_time <= file_modification_time: + continue + + f = open(fname, 'w') + content = 'gene_id\t%s\n' % (myid) + + for line in lines: + line = line.strip() + lst = line.split('\t') + gene_id = lst[0] + gene_name = lst[1] + c = lst[2] # chromosome, a letter, e.g., '1', '2', etc. + s = int(lst[3]) + e = int(lst[4]) + o = make_o(lst[5], glb_param_dict) # gene oritentation, + or - + + flank = int(glb_param_dict['TARGET_RANGE']) + interval = get_interval(c, s, e, o, flank, glb_param_dict['CHR_INFO']) # only consider upstream + + s0 = gene_id + + if data_dict[myid]['DATA_FORMAT'].upper() == 'NARROWPEAK': + intersection_list = get_interval_intersection(c, interval[0], interval[1], chip_data_dict[myid]) + if intersection_list != []: + s0 += '\t' + str(np.max(intersection_list)) # get the maximum value in the interval [change] + else: + s0 += '\t' + '0' + + if data_dict[myid]['DATA_FORMAT'].upper() == 'BW': + t = (c, interval[0], interval[1]) + try: + z = calc_intensity(t, chip_data_dict[myid], 1) + except RuntimeError: + z = -1.0 + s0 += '\t' + '%4.2f' % (z) + + content += s0 + '\n' + + f.write(content) + f.close() + + # close bw files + for myid in id_lst: + if data_dict[myid]['DATA_FORMAT'].upper() == 'BW': + chip_data_dict[myid].close() + + + +## main +param_file = sys.argv[1] # should be parameter_for_buildCmatrix.txt +global_param_dict = make_global_param_dict(param_file) +data_dict = make_data_dict(param_file) +print('get_binding: I will produce binding column files for the following %d ChIP-seq IDs:\n%s.' + % (len(data_dict['ID_LIST']), ','.join(data_dict['ID_LIST']))) +make_table(global_param_dict['GENE_FILE'], data_dict, global_param_dict) |