# Usage: python common_peak.py dir # Purpose: Return common peaks for all narrowPekas in directory dir import glob import sys, os, operator, bisect import numpy as np def check_valid_peak_line(s): s = s.strip() lst = s.split() if lst[0].isdigit() or lst[0].lower().startswith('chr'): return True return False def make_chip_data(fname): ''' given a file, return a dictionary. key is chromosome number. value is a list. ''' d = {} if not os.path.exists(fname): print('Cannot find file %s' % (fname)) sys.exit() f = open(fname) lines = f.readlines() f.close() if not check_valid_peak_line(lines[0]): return d strength_lst = [] for line in lines: line = line.strip() lst = line.split() strength = lst[4] strength_lst.append(int(strength)) strength_lst = np.array(strength_lst) tau = np.percentile(strength_lst, 25) # only include strong peaks for line in lines: line = line.strip() lst = line.split() c = lst[0] ss = int(lst[1]) ee = int(lst[2]) strength = lst[4] if int(strength) >= tau: if not c in d: d[c] = [(ss, ee, strength)] else: d[c].append((ss, ee, strength)) for k in d: d[k] = sorted(d[k], key=operator.itemgetter(0, 1)) # sort by start position, then by end position return d def make_all(dir): d = {} for fname in glob.glob(os.path.join(dir, '*/*/*.narrowPeak')): d[fname] = make_chip_data(fname) return d def get_interval_intersection(chromosome, start_pos, end_pos, chip_dict): if len(chip_dict) == 0 or not chromosome in chip_dict: return False lst = chip_dict[chromosome] # get a list of intervals in that chromosome n = len(lst) slst, elst, strength_lst = zip(*lst) # make three sub-lists index1 = max(0, bisect.bisect(elst, start_pos)-2) # get a start position index2 = min(bisect.bisect(slst, end_pos)+2, n-1) # get a end position sublst = lst[index1:index2] #print('\t\t\tDEBUG sublst length: %d (index1 %d, index2 %d)' % (len(sublst), index1, index2)) for t in sublst: ss = t[0] ee = t[1] strength = t[2] if start_pos <= ee and end_pos >= ss: return True #print('chromosome=%s start_pos=%d end_pos=%d c=%s ss=%d ee=%d' % (chromosome, start_pos, end_pos, c, ss, ee)) return False def get_frequent_peaks(d): s = '' num_files = len(d) for c in ['chr1', 'chr2', 'chr3', 'chr4', 'chr5']: total_lst = [] for k in d: # k is file name if c in d[k]: total_lst.extend(d[k][c]) total_lst = sorted(total_lst, key=operator.itemgetter(0, 1)) for t in total_lst: # for each interval count = 0 for k in d: chromosome = c start_pos = t[0] end_pos = t[1] chip_dict = d[k] result = get_interval_intersection(chromosome, start_pos, end_pos, chip_dict) if result == True: count += 1 if count >= int(num_files * 0.75): line = '%s\n' % ('\t'.join([c, str(start_pos), str(end_pos), str(count), t[2], '.', '0', '0', '0', '0' ])) s += line return s ## main d = make_all(sys.argv[1]) fname = sys.argv[1].split('/')[-1] + '.merged.narrowPeak' #print(fname) s = get_frequent_peaks(d) f = open(fname, 'w') f.write(s) f.close()