summaryrefslogtreecommitdiff
path: root/Code/common_peak.py
diff options
context:
space:
mode:
Diffstat (limited to 'Code/common_peak.py')
-rw-r--r--Code/common_peak.py119
1 files changed, 119 insertions, 0 deletions
diff --git a/Code/common_peak.py b/Code/common_peak.py
new file mode 100644
index 0000000..c36899c
--- /dev/null
+++ b/Code/common_peak.py
@@ -0,0 +1,119 @@
+# Usage: python common_peak.py dir
+# Purpose: Return common peaks for all narrowPekas in directory dir
+
+import glob
+import sys, os, operator, bisect
+import numpy as np
+
+def check_valid_peak_line(s):
+ s = s.strip()
+ lst = s.split()
+ if lst[0].isdigit() or lst[0].lower().startswith('chr'):
+ return True
+ return False
+
+def make_chip_data(fname):
+ ''' given a file, return a dictionary. key is chromosome number. value is a list. '''
+
+ d = {}
+ if not os.path.exists(fname):
+ print('Cannot find file %s' % (fname))
+ sys.exit()
+
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ if not check_valid_peak_line(lines[0]):
+ return d
+
+ strength_lst = []
+ for line in lines:
+ line = line.strip()
+ lst = line.split()
+ strength = lst[4]
+ strength_lst.append(int(strength))
+ strength_lst = np.array(strength_lst)
+ tau = np.percentile(strength_lst, 25) # only include strong peaks
+
+ for line in lines:
+ line = line.strip()
+ lst = line.split()
+ c = lst[0]
+
+ ss = int(lst[1])
+ ee = int(lst[2])
+ strength = lst[4]
+ if int(strength) >= tau:
+ if not c in d:
+ d[c] = [(ss, ee, strength)]
+ else:
+ d[c].append((ss, ee, strength))
+
+ for k in d:
+ d[k] = sorted(d[k], key=operator.itemgetter(0, 1)) # sort by start position, then by end position
+ return d
+
+def make_all(dir):
+ d = {}
+ for fname in glob.glob(os.path.join(dir, '*/*/*.narrowPeak')):
+ d[fname] = make_chip_data(fname)
+ return d
+
+
+def get_interval_intersection(chromosome, start_pos, end_pos, chip_dict):
+
+ if len(chip_dict) == 0 or not chromosome in chip_dict:
+ return False
+
+ lst = chip_dict[chromosome] # get a list of intervals in that chromosome
+ n = len(lst)
+ slst, elst, strength_lst = zip(*lst) # make three sub-lists
+ index1 = max(0, bisect.bisect(elst, start_pos)-2) # get a start position
+ index2 = min(bisect.bisect(slst, end_pos)+2, n-1) # get a end position
+ sublst = lst[index1:index2]
+ #print('\t\t\tDEBUG sublst length: %d (index1 %d, index2 %d)' % (len(sublst), index1, index2))
+ for t in sublst:
+ ss = t[0]
+ ee = t[1]
+ strength = t[2]
+ if start_pos <= ee and end_pos >= ss:
+ return True
+ #print('chromosome=%s start_pos=%d end_pos=%d c=%s ss=%d ee=%d' % (chromosome, start_pos, end_pos, c, ss, ee))
+ return False
+
+def get_frequent_peaks(d):
+
+ s = ''
+ num_files = len(d)
+ for c in ['chr1', 'chr2', 'chr3', 'chr4', 'chr5']:
+ total_lst = []
+ for k in d: # k is file name
+ if c in d[k]:
+ total_lst.extend(d[k][c])
+ total_lst = sorted(total_lst, key=operator.itemgetter(0, 1))
+ for t in total_lst: # for each interval
+ count = 0
+ for k in d:
+ chromosome = c
+ start_pos = t[0]
+ end_pos = t[1]
+ chip_dict = d[k]
+ result = get_interval_intersection(chromosome, start_pos, end_pos, chip_dict)
+ if result == True:
+ count += 1
+
+ if count >= int(num_files * 0.75):
+ line = '%s\n' % ('\t'.join([c, str(start_pos), str(end_pos), str(count), t[2], '.', '0', '0', '0', '0' ]))
+ s += line
+
+ return s
+
+
+## main
+d = make_all(sys.argv[1])
+fname = sys.argv[1].split('/')[-1] + '.merged.narrowPeak'
+#print(fname)
+s = get_frequent_peaks(d)
+f = open(fname, 'w')
+f.write(s)
+f.close()