summaryrefslogtreecommitdiff
path: root/Code/get_binding.py
diff options
context:
space:
mode:
Diffstat (limited to 'Code/get_binding.py')
-rw-r--r--Code/get_binding.py384
1 files changed, 384 insertions, 0 deletions
diff --git a/Code/get_binding.py b/Code/get_binding.py
new file mode 100644
index 0000000..e4087ca
--- /dev/null
+++ b/Code/get_binding.py
@@ -0,0 +1,384 @@
+# Usage: python get_binding.py parameter_for_buildCmatrix.txt
+#
+# Manually change CHR_INFO in parameter_for_buildCmatrix.txt if
+# your organism is not Arabidopsis. Set REBUILD_LIST in
+# parameter_for_buildCmatrix.txt if you only want to make
+# binding column for a few ChIP-seq IDs (useful when adding new
+# ChIP-seq data but not wanting to re-generate existing binding
+# files).
+#
+#
+# Purpose: make individual binding column files, one for each ChIP-seq
+# ID. These files will be combined by buildCmatrix.py into a
+# big binding matrix, binding.txt.
+#
+# A typical column file looks like:
+#
+# gene_id C0003000001450
+# AT1G01010 0
+# AT1G01020 0
+# AT1G01030 0
+# AT1G01040 0
+# AT1G01046 0
+# AT1G01050 0
+# AT1G01060 0
+# ...
+#
+# Last modified 5 APR 2017 slcu hui
+# Last modified 4 AUG 2019 zjnu hui
+
+
+import sys, os, operator, bisect
+import numpy as np
+import pyBigWig
+from datetime import datetime
+
+
+####################################
+GLB_PARAM_SYMBOL = '%%'
+LCL_PARAM_SYMBOL = '%'
+DATA_SYMBOL = '@'
+####################################
+
+def get_key_value(s):
+ lst = s.split('=')
+ k, v = lst[0], lst[1]
+ return (k.strip(), v.strip())
+
+
+def get_value(s, delimit):
+ lst = s.split(delimit, 1)
+ return lst[1].strip()
+
+
+def make_chromosome_dict(fname):
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ d = {}
+ for line in lines:
+ line = line.strip()
+ if line != '' and not line.startswith('#'):
+ lst = line.split('\t')
+ k = lst[0]
+ v = lst[1]
+ d[k] = int(v)
+ return d
+
+
+def make_global_param_dict(fname):
+ f = open(fname)
+ d = {'GENE_FILE':'', 'TARGET_RANGE':'3000', 'FC':'2.0', 'PVALUE':'0.0001', 'QVALUE':'0.01', 'CHR_INFO':{}, 'DESTINATION':'', 'REBUILD_LIST':[] } # change
+ for line in f:
+ line = line.strip()
+ if line.startswith(GLB_PARAM_SYMBOL):
+ s = line[line.rfind(GLB_PARAM_SYMBOL[-1])+1:]
+ lst = s.split('\t') # separate items by TAB
+ for x in lst:
+ if x != '':
+ k, v = get_key_value(x)
+ if k == 'REBUILD_LIST' and v.lower() != 'all':
+ d[k] = v.split() # make a list and rewrite d[k]
+ elif k == 'CHR_INFO' and os.path.exists(v):
+ d[k] = make_chromosome_dict(v)
+ else:
+ d[k] = v
+
+ if len(d['CHR_INFO']) == 0:
+ print('get_binding.py ERROR: must specify chromosome information CHR_INFO in paramter_for_buildCmatrix.txt.')
+ sys.exit()
+
+ f.close()
+ return d
+
+
+# for each dataset, make a column gene_id, sample_id
+def make_chip_data(fname):
+ ''' fname - a narrowPeak file
+ given a file, return a dictionary. key is chromosome number. value is a list (start_pos, end_pos, strength).
+ '''
+
+ d = {}
+ if not os.path.exists(fname):
+ print('get_binding: cannot find file %s' % (fname))
+ sys.exit()
+
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ if not check_valid_peak_line(lines[0]): # very basic check
+ print('get_binding: %s is not a valid BED file as in the first line the chromosome is not a number or starts with chr. ignored.' % (fname))
+ return d
+ for line in lines:
+ line = line.strip()
+ lst = line.split()
+ c = lst[0]
+
+ rindex = c.lower().rfind('r') # handle chr3 case, get 3 only. If it is Pt or Mt, use it.
+ if rindex != -1:
+ c = c[rindex+1:].strip()
+
+ ss = int(lst[1])
+ ee = int(lst[2])
+ strength = lst[4] # the 5th column is used as strength
+ if not c in d:
+ d[c] = [(ss, ee, strength)]
+ else:
+ d[c].append((ss, ee, strength))
+
+ for k in d:
+ d[k] = sorted(d[k], key=operator.itemgetter(0, 1)) # sort by start position, then by end position
+ return d
+
+
+def make_data_dict(fname):
+ ''' fname is paramter_for_buildCmatrix.txt '''
+
+ d = {'ID_LIST':[]} # keep a list of chip id's, such as C0001100007100
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ if line == '' or line.startswith('#'):
+ continue
+ if line.startswith(DATA_SYMBOL):
+ s = line[line.rfind(DATA_SYMBOL[-1])+1:]
+ s = s.strip()
+ if s in d:
+ print('get_binding: ID %s is duplicate. Check paramter_for_buildCmatrix.txt.' % (s))
+ sys.exit()
+ d[s] = {'PROTEIN_ID':'', 'PROTEN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''}
+ if line.startswith('DESCRIPTION:'):
+ d[s]['DESCRIPTION'] = get_value(line, ':')
+ elif line.startswith('PROTEN_NAME:'):
+ d[s]['PROTEN_NAME'] = get_value(line, ':')
+ elif line.startswith('PROTEIN_ID:'):
+ d[s]['PROTEIN_ID'] = get_value(line, ':')
+ elif line.startswith('DATA_NAME:'):
+ d[s]['DATA_NAME'] = get_value(line, ':')
+ elif line.startswith('DATA_FORMAT:'):
+ d[s]['DATA_FORMAT'] = get_value(line, ':')
+ elif line.startswith('LOCATION:'):
+ d[s]['LOCATION'] = get_value(line, ':')
+ if os.path.exists(d[s]['LOCATION']):
+ d['ID_LIST'].append(s)
+ else:
+ print('get_binding: I could not find file\n%s\nfor ChIP-seq ID %s. Ignore this ID.' % (d[s]['LOCATION'], s))
+ elif line.startswith('NOTE:'):
+ d[s]['NOTE'] = get_value(line, ':')
+ elif line.startswith(LCL_PARAM_SYMBOL) and not line.startswith(GLB_PARAM_SYMBOL):
+ make_local_parameter(d[s]['PARAM'], line)
+
+ return d
+
+
+def get_interval(c, s, e, o, flank, chr_info_dict):
+ '''
+ o --- + positive strand, don't include gene body
+ ++ positive strand, include gene body
+ - negative strand, don't include gene body
+ -- negative strand, include gene body
+ * include both sides and gene body
+ '''
+
+ if o == '+':
+ return (np.max([0, s-flank]), s)
+ elif o == '++': # also include gene body
+ return (np.max([0, s-flank]), e)
+ elif o == '-':
+ return (e, np.min([e+flank, chr_info_dict[c]]))
+ elif o == '--':
+ return (s, np.min([e+flank, chr_info_dict[c]]))
+ elif o == '*': # both sides and gene body
+ return ( np.max([0, s-flank]), np.min([e+flank, chr_info_dict[c]]) )
+ return (-1, -1) # should not be here
+
+
+def check_valid_peak_line(s):
+ s = s.strip()
+ lst = s.split()
+ if lst[0].isdigit() or lst[0].lower().startswith('chr'):
+ return True
+ return False
+
+
+def get_interval_intersection(chromosome, start_pos, end_pos, chip_dict):
+ ''' check with a given interval has intersection with peaks from a ChIP-seq '''
+ if len(chip_dict) == 0 or not chromosome in chip_dict:
+ return []
+
+ result = []
+ lst = chip_dict[chromosome] # get a list of intervals in that chromosome
+ n = len(lst)
+ slst, elst, strength_lst = zip(*lst) # make three sub-lists
+ index1 = max(0, bisect.bisect(elst, start_pos)-2) # get start position
+ index2 = min(bisect.bisect(slst, end_pos)+2, n-1) # get end position
+ sublst = lst[index1:index2] # these intervals potentially have intersections with given interval
+ #print('\t\t\tDEBUG sublst length: %d (index1 %d, index2 %d)' % (len(sublst), index1, index2))
+ for t in sublst:
+ ss = t[0]
+ ee = t[1]
+ strength = t[2]
+ if start_pos <= ee and end_pos >= ss:
+ result.append(int(strength))
+ #print('chromosome=%s start_pos=%d end_pos=%d c=%s ss=%d ee=%d' % (chromosome, start_pos, end_pos, c, ss, ee))
+ return result
+
+
+def make_o(c, glb_param_dict):
+ ''' make orientation '''
+ ig = glb_param_dict['INCLUDE_GENE_BODY'].upper() == 'YES' # include gene body
+ both_side = glb_param_dict['BOTH_SIDE'].upper() == 'YES'
+ if both_side:
+ return '*'
+ if ig:
+ return 2*c # also include gene body
+ return c
+
+
+def calc_intensity(t, bw, norm_by_gene_length=0):
+ '''
+ t - a tuple, chromosome, start and end position
+ '''
+ chromosome = t[0]
+ start = t[1]
+ end = t[2]
+ if end < start:
+ print('get_binding.py calc_intensity: start position must be less than end position')
+ sys.exit()
+
+ I = bw.intervals(chromosome, start, end) # a list of intervals
+
+ sum_area = 0.0
+ if I: # is I is not empty
+ for t in I:
+ s0 = max(start, t[0])
+ e0 = min(end, t[1])
+ h = t[2]
+ #print('=== %d %d %g' % (s0, e0, h))
+ sum_area += h * (e0 - s0)
+
+ if norm_by_gene_length != 0:
+ return 1000.0 * sum_area / (end - start)
+ else:
+ return 1.0 * sum_area
+
+
+def get_update_date(s):
+ index = s.find('update:')
+ if index < 0:
+ return None
+ result = s[s.rfind('update:')+7:].strip()
+ if result.isdigit() and len(result) == 6:
+ return result
+ else:
+ return '00000000'
+
+
+def make_table(gene_file, data_dict, glb_param_dict):
+ '''
+ Each line in gene_file contains TAB-separated fields: gene_id, gene_name, chr, start, end, strand, description (optional).
+ '''
+
+ if glb_param_dict['REBUILD_LIST'] == []: # when not specified, use all
+ id_lst = data_dict['ID_LIST']
+ else:
+ id_lst = glb_param_dict['REBUILD_LIST']
+ for c in id_lst:
+ if not c in data_dict['ID_LIST']:
+ print('get_binding: %s has no corresponding ChIP-seq data (narrowPeak or bw)' % (c))
+ sys.exit()
+
+ chip_data_dict = {}
+ for myid in id_lst:
+ chip_file = data_dict[myid]['LOCATION']
+
+ if not os.path.exists(chip_file):
+ print('get_binding: file %s dose not exists.' % (chip_file))
+ sys.exit()
+
+ if data_dict[myid]['DATA_FORMAT'].upper() == 'NARROWPEAK':
+ chip_data = make_chip_data(chip_file)
+
+ elif data_dict[myid]['DATA_FORMAT'].upper() == 'BW':
+ chip_data = pyBigWig.open(chip_file)
+
+ else:
+ print('get_binding: data format %s not supported!' % (data_dict[myid]['DATA_FORMAT']))
+ sys.exit()
+
+ chip_data_dict[myid] = chip_data
+
+
+ f = open(gene_file)
+ lines = f.readlines() # lines contain all lines in gene_file.txt
+ f.close()
+
+ dest = glb_param_dict['DESTINATION'] # individual binding column files will be put here
+ if not os.path.isdir(dest):
+ os.makedirs(dest)
+
+ for myid in id_lst: # for each ChIP-seq ID, make a file in DESTINATION with file name such as C0001100007100.txt
+ #print('Processing %s' % (myid))
+ fname = os.path.join(dest, myid + '.txt')
+ if os.path.exists(fname): # this file has been created before. Remake the binding file if and only if the update date in the NOTE: field is greater than the file's modification date.
+ file_modification_time = datetime.fromtimestamp(os.path.getmtime(fname)).strftime('%Y%m%d')
+ file_update_time = get_update_date(data_dict[myid]['NOTE'])
+ if file_update_time == None:
+ continue
+ if file_update_time <= file_modification_time:
+ continue
+
+ f = open(fname, 'w')
+ content = 'gene_id\t%s\n' % (myid)
+
+ for line in lines:
+ line = line.strip()
+ lst = line.split('\t')
+ gene_id = lst[0]
+ gene_name = lst[1]
+ c = lst[2] # chromosome, a letter, e.g., '1', '2', etc.
+ s = int(lst[3])
+ e = int(lst[4])
+ o = make_o(lst[5], glb_param_dict) # gene oritentation, + or -
+
+ flank = int(glb_param_dict['TARGET_RANGE'])
+ interval = get_interval(c, s, e, o, flank, glb_param_dict['CHR_INFO']) # only consider upstream
+
+ s0 = gene_id
+
+ if data_dict[myid]['DATA_FORMAT'].upper() == 'NARROWPEAK':
+ intersection_list = get_interval_intersection(c, interval[0], interval[1], chip_data_dict[myid])
+ if intersection_list != []:
+ s0 += '\t' + str(np.max(intersection_list)) # get the maximum value in the interval [change]
+ else:
+ s0 += '\t' + '0'
+
+ if data_dict[myid]['DATA_FORMAT'].upper() == 'BW':
+ t = (c, interval[0], interval[1])
+ try:
+ z = calc_intensity(t, chip_data_dict[myid], 1)
+ except RuntimeError:
+ z = -1.0
+ s0 += '\t' + '%4.2f' % (z)
+
+ content += s0 + '\n'
+
+ f.write(content)
+ f.close()
+
+ # close bw files
+ for myid in id_lst:
+ if data_dict[myid]['DATA_FORMAT'].upper() == 'BW':
+ chip_data_dict[myid].close()
+
+
+
+## main
+param_file = sys.argv[1] # should be parameter_for_buildCmatrix.txt
+global_param_dict = make_global_param_dict(param_file)
+data_dict = make_data_dict(param_file)
+print('get_binding: I will produce binding column files for the following %d ChIP-seq IDs:\n%s.'
+ % (len(data_dict['ID_LIST']), ','.join(data_dict['ID_LIST'])))
+make_table(global_param_dict['GENE_FILE'], data_dict, global_param_dict)