# Usage: python get_binding.py parameter_for_buildCmatrix.txt # # Manually change CHR_INFO in parameter_for_buildCmatrix.txt if # your organism is not Arabidopsis. Set REBUILD_LIST in # parameter_for_buildCmatrix.txt if you only want to make # binding column for a few ChIP-seq IDs (useful when adding new # ChIP-seq data but not wanting to re-generate existing binding # files). # # # Purpose: make individual binding column files, one for each ChIP-seq # ID. These files will be combined by buildCmatrix.py into a # big binding matrix, binding.txt. # # A typical column file looks like: # # gene_id C0003000001450 # AT1G01010 0 # AT1G01020 0 # AT1G01030 0 # AT1G01040 0 # AT1G01046 0 # AT1G01050 0 # AT1G01060 0 # ... # # Last modified 5 APR 2017 slcu hui # Last modified 4 AUG 2019 zjnu hui import sys, os, operator, bisect import numpy as np import pyBigWig from datetime import datetime #################################### GLB_PARAM_SYMBOL = '%%' LCL_PARAM_SYMBOL = '%' DATA_SYMBOL = '@' #################################### def get_key_value(s): lst = s.split('=') k, v = lst[0], lst[1] return (k.strip(), v.strip()) def get_value(s, delimit): lst = s.split(delimit, 1) return lst[1].strip() def make_chromosome_dict(fname): f = open(fname) lines = f.readlines() f.close() d = {} for line in lines: line = line.strip() if line != '' and not line.startswith('#'): lst = line.split('\t') k = lst[0] v = lst[1] d[k] = int(v) return d def make_global_param_dict(fname): f = open(fname) d = {'GENE_FILE':'', 'TARGET_RANGE':'3000', 'FC':'2.0', 'PVALUE':'0.0001', 'QVALUE':'0.01', 'CHR_INFO':{}, 'DESTINATION':'', 'REBUILD_LIST':[] } # change for line in f: line = line.strip() if line.startswith(GLB_PARAM_SYMBOL): s = line[line.rfind(GLB_PARAM_SYMBOL[-1])+1:] lst = s.split('\t') # separate items by TAB for x in lst: if x != '': k, v = get_key_value(x) if k == 'REBUILD_LIST' and v.lower() != 'all': d[k] = v.split() # make a list and rewrite d[k] elif k == 'CHR_INFO' and os.path.exists(v): d[k] = make_chromosome_dict(v) else: d[k] = v if len(d['CHR_INFO']) == 0: print('get_binding.py ERROR: must specify chromosome information CHR_INFO in paramter_for_buildCmatrix.txt.') sys.exit() f.close() return d # for each dataset, make a column gene_id, sample_id def make_chip_data(fname): ''' fname - a narrowPeak file given a file, return a dictionary. key is chromosome number. value is a list (start_pos, end_pos, strength). ''' d = {} if not os.path.exists(fname): print('get_binding: cannot find file %s' % (fname)) sys.exit() f = open(fname) lines = f.readlines() f.close() if not check_valid_peak_line(lines[0]): # very basic check print('get_binding: %s is not a valid BED file as in the first line the chromosome is not a number or starts with chr. ignored.' % (fname)) return d for line in lines: line = line.strip() lst = line.split() c = lst[0] rindex = c.lower().rfind('r') # handle chr3 case, get 3 only. If it is Pt or Mt, use it. if rindex != -1: c = c[rindex+1:].strip() ss = int(lst[1]) ee = int(lst[2]) strength = lst[4] # the 5th column is used as strength if not c in d: d[c] = [(ss, ee, strength)] else: d[c].append((ss, ee, strength)) for k in d: d[k] = sorted(d[k], key=operator.itemgetter(0, 1)) # sort by start position, then by end position return d def make_data_dict(fname): ''' fname is paramter_for_buildCmatrix.txt ''' d = {'ID_LIST':[]} # keep a list of chip id's, such as C0001100007100 f = open(fname) lines = f.readlines() f.close() for line in lines: line = line.strip() if line == '' or line.startswith('#'): continue if line.startswith(DATA_SYMBOL): s = line[line.rfind(DATA_SYMBOL[-1])+1:] s = s.strip() if s in d: print('get_binding: ID %s is duplicate. Check paramter_for_buildCmatrix.txt.' % (s)) sys.exit() d[s] = {'PROTEIN_ID':'', 'PROTEN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''} if line.startswith('DESCRIPTION:'): d[s]['DESCRIPTION'] = get_value(line, ':') elif line.startswith('PROTEN_NAME:'): d[s]['PROTEN_NAME'] = get_value(line, ':') elif line.startswith('PROTEIN_ID:'): d[s]['PROTEIN_ID'] = get_value(line, ':') elif line.startswith('DATA_NAME:'): d[s]['DATA_NAME'] = get_value(line, ':') elif line.startswith('DATA_FORMAT:'): d[s]['DATA_FORMAT'] = get_value(line, ':') elif line.startswith('LOCATION:'): d[s]['LOCATION'] = get_value(line, ':') if os.path.exists(d[s]['LOCATION']): d['ID_LIST'].append(s) else: print('get_binding: I could not find file\n%s\nfor ChIP-seq ID %s. Ignore this ID.' % (d[s]['LOCATION'], s)) elif line.startswith('NOTE:'): d[s]['NOTE'] = get_value(line, ':') elif line.startswith(LCL_PARAM_SYMBOL) and not line.startswith(GLB_PARAM_SYMBOL): make_local_parameter(d[s]['PARAM'], line) return d def get_interval(c, s, e, o, flank, chr_info_dict): ''' o --- + positive strand, don't include gene body ++ positive strand, include gene body - negative strand, don't include gene body -- negative strand, include gene body * include both sides and gene body ''' if o == '+': return (np.max([0, s-flank]), s) elif o == '++': # also include gene body return (np.max([0, s-flank]), e) elif o == '-': return (e, np.min([e+flank, chr_info_dict[c]])) elif o == '--': return (s, np.min([e+flank, chr_info_dict[c]])) elif o == '*': # both sides and gene body return ( np.max([0, s-flank]), np.min([e+flank, chr_info_dict[c]]) ) return (-1, -1) # should not be here def check_valid_peak_line(s): s = s.strip() lst = s.split() if lst[0].isdigit() or lst[0].lower().startswith('chr'): return True return False def get_interval_intersection(chromosome, start_pos, end_pos, chip_dict): ''' check with a given interval has intersection with peaks from a ChIP-seq ''' if len(chip_dict) == 0 or not chromosome in chip_dict: return [] result = [] lst = chip_dict[chromosome] # get a list of intervals in that chromosome n = len(lst) slst, elst, strength_lst = zip(*lst) # make three sub-lists index1 = max(0, bisect.bisect(elst, start_pos)-2) # get start position index2 = min(bisect.bisect(slst, end_pos)+2, n-1) # get end position sublst = lst[index1:index2] # these intervals potentially have intersections with given interval #print('\t\t\tDEBUG sublst length: %d (index1 %d, index2 %d)' % (len(sublst), index1, index2)) for t in sublst: ss = t[0] ee = t[1] strength = t[2] if start_pos <= ee and end_pos >= ss: result.append(int(strength)) #print('chromosome=%s start_pos=%d end_pos=%d c=%s ss=%d ee=%d' % (chromosome, start_pos, end_pos, c, ss, ee)) return result def make_o(c, glb_param_dict): ''' make orientation ''' ig = glb_param_dict['INCLUDE_GENE_BODY'].upper() == 'YES' # include gene body both_side = glb_param_dict['BOTH_SIDE'].upper() == 'YES' if both_side: return '*' if ig: return 2*c # also include gene body return c def calc_intensity(t, bw, norm_by_gene_length=0): ''' t - a tuple, chromosome, start and end position ''' chromosome = t[0] start = t[1] end = t[2] if end < start: print('get_binding.py calc_intensity: start position must be less than end position') sys.exit() I = bw.intervals(chromosome, start, end) # a list of intervals sum_area = 0.0 if I: # is I is not empty for t in I: s0 = max(start, t[0]) e0 = min(end, t[1]) h = t[2] #print('=== %d %d %g' % (s0, e0, h)) sum_area += h * (e0 - s0) if norm_by_gene_length != 0: return 1000.0 * sum_area / (end - start) else: return 1.0 * sum_area def get_update_date(s): index = s.find('update:') if index < 0: return None result = s[s.rfind('update:')+7:].strip() if result.isdigit() and len(result) == 6: return result else: return '00000000' def make_table(gene_file, data_dict, glb_param_dict): ''' Each line in gene_file contains TAB-separated fields: gene_id, gene_name, chr, start, end, strand, description (optional). ''' if glb_param_dict['REBUILD_LIST'] == []: # when not specified, use all id_lst = data_dict['ID_LIST'] else: id_lst = glb_param_dict['REBUILD_LIST'] for c in id_lst: if not c in data_dict['ID_LIST']: print('get_binding: %s has no corresponding ChIP-seq data (narrowPeak or bw)' % (c)) sys.exit() chip_data_dict = {} for myid in id_lst: chip_file = data_dict[myid]['LOCATION'] if not os.path.exists(chip_file): print('get_binding: file %s dose not exists.' % (chip_file)) sys.exit() if data_dict[myid]['DATA_FORMAT'].upper() == 'NARROWPEAK': chip_data = make_chip_data(chip_file) elif data_dict[myid]['DATA_FORMAT'].upper() == 'BW': chip_data = pyBigWig.open(chip_file) else: print('get_binding: data format %s not supported!' % (data_dict[myid]['DATA_FORMAT'])) sys.exit() chip_data_dict[myid] = chip_data f = open(gene_file) lines = f.readlines() # lines contain all lines in gene_file.txt f.close() dest = glb_param_dict['DESTINATION'] # individual binding column files will be put here if not os.path.isdir(dest): os.makedirs(dest) for myid in id_lst: # for each ChIP-seq ID, make a file in DESTINATION with file name such as C0001100007100.txt #print('Processing %s' % (myid)) fname = os.path.join(dest, myid + '.txt') if os.path.exists(fname): # this file has been created before. Remake the binding file if and only if the update date in the NOTE: field is greater than the file's modification date. file_modification_time = datetime.fromtimestamp(os.path.getmtime(fname)).strftime('%Y%m%d') file_update_time = get_update_date(data_dict[myid]['NOTE']) if file_update_time == None: continue if file_update_time <= file_modification_time: continue f = open(fname, 'w') content = 'gene_id\t%s\n' % (myid) for line in lines: line = line.strip() lst = line.split('\t') gene_id = lst[0] gene_name = lst[1] c = lst[2] # chromosome, a letter, e.g., '1', '2', etc. s = int(lst[3]) e = int(lst[4]) o = make_o(lst[5], glb_param_dict) # gene oritentation, + or - flank = int(glb_param_dict['TARGET_RANGE']) interval = get_interval(c, s, e, o, flank, glb_param_dict['CHR_INFO']) # only consider upstream s0 = gene_id if data_dict[myid]['DATA_FORMAT'].upper() == 'NARROWPEAK': intersection_list = get_interval_intersection(c, interval[0], interval[1], chip_data_dict[myid]) if intersection_list != []: s0 += '\t' + str(np.max(intersection_list)) # get the maximum value in the interval [change] else: s0 += '\t' + '0' if data_dict[myid]['DATA_FORMAT'].upper() == 'BW': t = (c, interval[0], interval[1]) try: z = calc_intensity(t, chip_data_dict[myid], 1) except RuntimeError: z = -1.0 s0 += '\t' + '%4.2f' % (z) content += s0 + '\n' f.write(content) f.close() # close bw files for myid in id_lst: if data_dict[myid]['DATA_FORMAT'].upper() == 'BW': chip_data_dict[myid].close() ## main param_file = sys.argv[1] # should be parameter_for_buildCmatrix.txt global_param_dict = make_global_param_dict(param_file) data_dict = make_data_dict(param_file) print('get_binding: I will produce binding column files for the following %d ChIP-seq IDs:\n%s.' % (len(data_dict['ID_LIST']), ','.join(data_dict['ID_LIST']))) make_table(global_param_dict['GENE_FILE'], data_dict, global_param_dict)