1 files changed, 119 insertions, 0 deletions
diff --git a/Code/common_peak.py b/Code/common_peak.py
new file mode 100644
index 0000000..c36899c
--- /dev/null
+++ b/Code/common_peak.py
@@ -0,0 +1,119 @@
+# Usage: python common_peak.py dir
+# Purpose: Return common peaks for all narrowPekas in directory dir
+
+import glob
+import sys, os, operator, bisect
+import numpy as np
+
+def check_valid_peak_line(s):
+    s = s.strip()
+    lst = s.split()
+    if lst[0].isdigit() or lst[0].lower().startswith('chr'):
+        return True
+    return False
+
+def make_chip_data(fname):
+    ''' given a file, return a dictionary. key is chromosome number. value is a list. '''
+
+    d = {}
+    if not os.path.exists(fname):
+        print('Cannot find file %s' % (fname))
+        sys.exit()
+
+    f = open(fname)
+    lines = f.readlines()
+    f.close()
+    if not check_valid_peak_line(lines[0]):
+        return d
+
+    strength_lst = []
+    for line in lines:
+        line = line.strip()
+        lst = line.split()
+        strength = lst[4]
+        strength_lst.append(int(strength))
+    strength_lst = np.array(strength_lst)
+    tau = np.percentile(strength_lst, 25)  # only include strong peaks
+
+    for line in lines:
+        line = line.strip()
+        lst = line.split()
+        c = lst[0]
+
+        ss = int(lst[1])
+        ee = int(lst[2])
+        strength = lst[4]
+        if int(strength) >= tau:
+            if not c in d:
+                d[c] = [(ss, ee, strength)]
+            else:
+                d[c].append((ss, ee, strength))
+
+    for k in d:
+        d[k] = sorted(d[k], key=operator.itemgetter(0, 1)) # sort by start position, then by end position
+    return d
+
+def make_all(dir):
+    d = {}
+    for fname in glob.glob(os.path.join(dir, '*/*/*.narrowPeak')):
+        d[fname] = make_chip_data(fname)
+    return d
+
+
+def get_interval_intersection(chromosome, start_pos, end_pos, chip_dict):
+
+    if len(chip_dict) == 0 or not chromosome in chip_dict:
+        return False
+
+    lst = chip_dict[chromosome] # get a list of intervals in that chromosome
+    n = len(lst)
+    slst, elst, strength_lst = zip(*lst) # make three sub-lists
+    index1 = max(0, bisect.bisect(elst, start_pos)-2) # get a start position
+    index2 = min(bisect.bisect(slst, end_pos)+2, n-1) # get a end position
+    sublst = lst[index1:index2]
+    #print('\t\t\tDEBUG sublst length: %d (index1 %d, index2 %d)' % (len(sublst), index1, index2))
+    for t in sublst:
+        ss = t[0]
+        ee = t[1]
+        strength = t[2]
+        if start_pos <= ee and end_pos >= ss:
+            return True
+            #print('chromosome=%s start_pos=%d end_pos=%d c=%s ss=%d ee=%d' % (chromosome, start_pos, end_pos, c, ss, ee))
+    return False
+
+def get_frequent_peaks(d):
+
+    s = ''
+    num_files = len(d)
+    for c in ['chr1', 'chr2', 'chr3', 'chr4', 'chr5']:
+        total_lst = []
+        for k in d: # k is file name
+            if c in d[k]:
+                total_lst.extend(d[k][c])
+        total_lst = sorted(total_lst, key=operator.itemgetter(0, 1))
+        for t in total_lst:  # for each interval
+            count = 0
+            for k in d:
+                chromosome = c
+                start_pos = t[0]
+                end_pos =   t[1]
+                chip_dict = d[k]
+                result = get_interval_intersection(chromosome, start_pos, end_pos, chip_dict)
+                if result == True:
+                    count += 1
+
+            if count >= int(num_files * 0.75):
+                line = '%s\n' % ('\t'.join([c, str(start_pos), str(end_pos), str(count), t[2], '.', '0', '0', '0', '0' ]))
+                s += line
+
+    return s
+
+
+## main
+d = make_all(sys.argv[1])
+fname = sys.argv[1].split('/')[-1] + '.merged.narrowPeak'
+#print(fname)
+s = get_frequent_peaks(d)
+f = open(fname, 'w')
+f.write(s)
+f.close()