diff options
Diffstat (limited to 'Code/common_peak.py')
-rw-r--r-- | Code/common_peak.py | 119 |
1 files changed, 119 insertions, 0 deletions
diff --git a/Code/common_peak.py b/Code/common_peak.py new file mode 100644 index 0000000..c36899c --- /dev/null +++ b/Code/common_peak.py @@ -0,0 +1,119 @@ +# Usage: python common_peak.py dir +# Purpose: Return common peaks for all narrowPekas in directory dir + +import glob +import sys, os, operator, bisect +import numpy as np + +def check_valid_peak_line(s): + s = s.strip() + lst = s.split() + if lst[0].isdigit() or lst[0].lower().startswith('chr'): + return True + return False + +def make_chip_data(fname): + ''' given a file, return a dictionary. key is chromosome number. value is a list. ''' + + d = {} + if not os.path.exists(fname): + print('Cannot find file %s' % (fname)) + sys.exit() + + f = open(fname) + lines = f.readlines() + f.close() + if not check_valid_peak_line(lines[0]): + return d + + strength_lst = [] + for line in lines: + line = line.strip() + lst = line.split() + strength = lst[4] + strength_lst.append(int(strength)) + strength_lst = np.array(strength_lst) + tau = np.percentile(strength_lst, 25) # only include strong peaks + + for line in lines: + line = line.strip() + lst = line.split() + c = lst[0] + + ss = int(lst[1]) + ee = int(lst[2]) + strength = lst[4] + if int(strength) >= tau: + if not c in d: + d[c] = [(ss, ee, strength)] + else: + d[c].append((ss, ee, strength)) + + for k in d: + d[k] = sorted(d[k], key=operator.itemgetter(0, 1)) # sort by start position, then by end position + return d + +def make_all(dir): + d = {} + for fname in glob.glob(os.path.join(dir, '*/*/*.narrowPeak')): + d[fname] = make_chip_data(fname) + return d + + +def get_interval_intersection(chromosome, start_pos, end_pos, chip_dict): + + if len(chip_dict) == 0 or not chromosome in chip_dict: + return False + + lst = chip_dict[chromosome] # get a list of intervals in that chromosome + n = len(lst) + slst, elst, strength_lst = zip(*lst) # make three sub-lists + index1 = max(0, bisect.bisect(elst, start_pos)-2) # get a start position + index2 = min(bisect.bisect(slst, end_pos)+2, n-1) # get a end position + sublst = lst[index1:index2] + #print('\t\t\tDEBUG sublst length: %d (index1 %d, index2 %d)' % (len(sublst), index1, index2)) + for t in sublst: + ss = t[0] + ee = t[1] + strength = t[2] + if start_pos <= ee and end_pos >= ss: + return True + #print('chromosome=%s start_pos=%d end_pos=%d c=%s ss=%d ee=%d' % (chromosome, start_pos, end_pos, c, ss, ee)) + return False + +def get_frequent_peaks(d): + + s = '' + num_files = len(d) + for c in ['chr1', 'chr2', 'chr3', 'chr4', 'chr5']: + total_lst = [] + for k in d: # k is file name + if c in d[k]: + total_lst.extend(d[k][c]) + total_lst = sorted(total_lst, key=operator.itemgetter(0, 1)) + for t in total_lst: # for each interval + count = 0 + for k in d: + chromosome = c + start_pos = t[0] + end_pos = t[1] + chip_dict = d[k] + result = get_interval_intersection(chromosome, start_pos, end_pos, chip_dict) + if result == True: + count += 1 + + if count >= int(num_files * 0.75): + line = '%s\n' % ('\t'.join([c, str(start_pos), str(end_pos), str(count), t[2], '.', '0', '0', '0', '0' ])) + s += line + + return s + + +## main +d = make_all(sys.argv[1]) +fname = sys.argv[1].split('/')[-1] + '.merged.narrowPeak' +#print(fname) +s = get_frequent_peaks(d) +f = open(fname, 'w') +f.write(s) +f.close() |