1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
|
# Usage: python common_peak.py dir
# Purpose: Return common peaks for all narrowPekas in directory dir
import glob
import sys, os, operator, bisect
import numpy as np
def check_valid_peak_line(s):
s = s.strip()
lst = s.split()
if lst[0].isdigit() or lst[0].lower().startswith('chr'):
return True
return False
def make_chip_data(fname):
''' given a file, return a dictionary. key is chromosome number. value is a list. '''
d = {}
if not os.path.exists(fname):
print('Cannot find file %s' % (fname))
sys.exit()
f = open(fname)
lines = f.readlines()
f.close()
if not check_valid_peak_line(lines[0]):
return d
strength_lst = []
for line in lines:
line = line.strip()
lst = line.split()
strength = lst[4]
strength_lst.append(int(strength))
strength_lst = np.array(strength_lst)
tau = np.percentile(strength_lst, 25) # only include strong peaks
for line in lines:
line = line.strip()
lst = line.split()
c = lst[0]
ss = int(lst[1])
ee = int(lst[2])
strength = lst[4]
if int(strength) >= tau:
if not c in d:
d[c] = [(ss, ee, strength)]
else:
d[c].append((ss, ee, strength))
for k in d:
d[k] = sorted(d[k], key=operator.itemgetter(0, 1)) # sort by start position, then by end position
return d
def make_all(dir):
d = {}
for fname in glob.glob(os.path.join(dir, '*/*/*.narrowPeak')):
d[fname] = make_chip_data(fname)
return d
def get_interval_intersection(chromosome, start_pos, end_pos, chip_dict):
if len(chip_dict) == 0 or not chromosome in chip_dict:
return False
lst = chip_dict[chromosome] # get a list of intervals in that chromosome
n = len(lst)
slst, elst, strength_lst = zip(*lst) # make three sub-lists
index1 = max(0, bisect.bisect(elst, start_pos)-2) # get a start position
index2 = min(bisect.bisect(slst, end_pos)+2, n-1) # get a end position
sublst = lst[index1:index2]
#print('\t\t\tDEBUG sublst length: %d (index1 %d, index2 %d)' % (len(sublst), index1, index2))
for t in sublst:
ss = t[0]
ee = t[1]
strength = t[2]
if start_pos <= ee and end_pos >= ss:
return True
#print('chromosome=%s start_pos=%d end_pos=%d c=%s ss=%d ee=%d' % (chromosome, start_pos, end_pos, c, ss, ee))
return False
def get_frequent_peaks(d):
s = ''
num_files = len(d)
for c in ['chr1', 'chr2', 'chr3', 'chr4', 'chr5']:
total_lst = []
for k in d: # k is file name
if c in d[k]:
total_lst.extend(d[k][c])
total_lst = sorted(total_lst, key=operator.itemgetter(0, 1))
for t in total_lst: # for each interval
count = 0
for k in d:
chromosome = c
start_pos = t[0]
end_pos = t[1]
chip_dict = d[k]
result = get_interval_intersection(chromosome, start_pos, end_pos, chip_dict)
if result == True:
count += 1
if count >= int(num_files * 0.75):
line = '%s\n' % ('\t'.join([c, str(start_pos), str(end_pos), str(count), t[2], '.', '0', '0', '0', '0' ]))
s += line
return s
## main
d = make_all(sys.argv[1])
fname = sys.argv[1].split('/')[-1] + '.merged.narrowPeak'
#print(fname)
s = get_frequent_peaks(d)
f = open(fname, 'w')
f.write(s)
f.close()
|