Code/common_peak.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119

# Usage: python common_peak.py dir
# Purpose: Return common peaks for all narrowPekas in directory dir

import glob
import sys, os, operator, bisect
import numpy as np

def check_valid_peak_line(s):
    s = s.strip()
    lst = s.split()
    if lst[0].isdigit() or lst[0].lower().startswith('chr'):
        return True
    return False

def make_chip_data(fname):
    ''' given a file, return a dictionary. key is chromosome number. value is a list. '''

    d = {}
    if not os.path.exists(fname):
        print('Cannot find file %s' % (fname))
        sys.exit()

    f = open(fname)
    lines = f.readlines()
    f.close()
    if not check_valid_peak_line(lines[0]):
        return d

    strength_lst = []
    for line in lines:
        line = line.strip()
        lst = line.split()
        strength = lst[4]
        strength_lst.append(int(strength))
    strength_lst = np.array(strength_lst)
    tau = np.percentile(strength_lst, 25)  # only include strong peaks

    for line in lines:
        line = line.strip()
        lst = line.split()
        c = lst[0]

        ss = int(lst[1])
        ee = int(lst[2])
        strength = lst[4]
        if int(strength) >= tau:
            if not c in d:
                d[c] = [(ss, ee, strength)]
            else:
                d[c].append((ss, ee, strength))

    for k in d:
        d[k] = sorted(d[k], key=operator.itemgetter(0, 1)) # sort by start position, then by end position
    return d

def make_all(dir):
    d = {}
    for fname in glob.glob(os.path.join(dir, '*/*/*.narrowPeak')):
        d[fname] = make_chip_data(fname)
    return d


def get_interval_intersection(chromosome, start_pos, end_pos, chip_dict):

    if len(chip_dict) == 0 or not chromosome in chip_dict:
        return False

    lst = chip_dict[chromosome] # get a list of intervals in that chromosome
    n = len(lst)
    slst, elst, strength_lst = zip(*lst) # make three sub-lists
    index1 = max(0, bisect.bisect(elst, start_pos)-2) # get a start position
    index2 = min(bisect.bisect(slst, end_pos)+2, n-1) # get a end position
    sublst = lst[index1:index2]
    #print('\t\t\tDEBUG sublst length: %d (index1 %d, index2 %d)' % (len(sublst), index1, index2))
    for t in sublst:
        ss = t[0]
        ee = t[1]
        strength = t[2]
        if start_pos <= ee and end_pos >= ss:
            return True
            #print('chromosome=%s start_pos=%d end_pos=%d c=%s ss=%d ee=%d' % (chromosome, start_pos, end_pos, c, ss, ee))
    return False

def get_frequent_peaks(d):

    s = ''
    num_files = len(d)
    for c in ['chr1', 'chr2', 'chr3', 'chr4', 'chr5']:
        total_lst = []
        for k in d: # k is file name
            if c in d[k]:
                total_lst.extend(d[k][c])
        total_lst = sorted(total_lst, key=operator.itemgetter(0, 1))
        for t in total_lst:  # for each interval
            count = 0
            for k in d:
                chromosome = c
                start_pos = t[0]
                end_pos =   t[1]
                chip_dict = d[k]
                result = get_interval_intersection(chromosome, start_pos, end_pos, chip_dict)
                if result == True:
                    count += 1

            if count >= int(num_files * 0.75):
                line = '%s\n' % ('\t'.join([c, str(start_pos), str(end_pos), str(count), t[2], '.', '0', '0', '0', '0' ]))
                s += line

    return s


## main
d = make_all(sys.argv[1])
fname = sys.argv[1].split('/')[-1] + '.merged.narrowPeak'
#print(fname)
s = get_frequent_peaks(d)
f = open(fname, 'w')
f.write(s)
f.close()