Code/mergeTPM.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96

# Purpose: merge all TPM files in PATH_TO_TPM and put the merged file to PATH_TO_TPM/assemble/
# Usage: python3 mergeTPM.py
# 17 Aug 2024, zjnu, hui

import os, glob, gzip

PATH_TO_TPM = '../Data/history/expr/'

tpm_files_to_include = glob.glob(os.path.join(PATH_TO_TPM, 'TPM*'))
#tpm_files_to_include = ['../Data/history/expr/TPM.inhouse.diurnal.txt', '../Data/history/expr/TPM.20190919.txt']

# Get all run IDs
run_ids = set()
for filename in tpm_files_to_include:
    print(filename)
    if filename.endswith('txt'):
        with open(filename) as f:
            line = f.readlines()[0]
            line = line.strip()
            lst = line.split('\t')
            for runid in lst[1:]:
                run_ids.add(runid)
    elif filename.endswith('gz'):
        with gzip.open(filename, 'rt') as f:
            line = f.readlines()[0]
            line = line.strip()
            lst = line.split('\t')
            for runid in lst[1:]:
                run_ids.add(runid)

print('Total unique run IDs: %d' % len(run_ids))

# Get gene IDs
gene_ids = []
with open(os.path.join(PATH_TO_TPM, 'TPM.txt')) as f:
    for line in f:
        line = line.strip()
        if line.startswith('AT'):
            lst = line.split('\t')
            gene_ids.append(lst[0])

assert len(gene_ids) == 33602


# Assemble gene expressions
g = {} # {'run':{'g1':1, 'g2':2}}
def populate_dictionary(d, lines):
    line = lines[0].strip()
    runs = line.split('\t')[1:]
    for line in lines[1:]:
        line = line.strip()
        lst = line.split('\t')
        gene = lst[0]
        expression_levels = lst[1:]
        run_index = 0
        for x in expression_levels:
            run = runs[run_index]
            if not run in d:
                d[run] = {}
            d[run][gene] = x
            run_index += 1
    

for filename in tpm_files_to_include:
    print('Assemble ' + filename)
    if filename.endswith('txt'):
        with open(filename) as f:
            lines = f.readlines()
            populate_dictionary(g, lines)            
    elif filename.endswith('gz'):
        with gzip.open(filename, 'rt') as f:
            lines = f.readlines()
            populate_dictionary(g, lines)            
    

# Write to TPM.assemble.number.txt
run_ids_sorted = sorted(list(run_ids))
assemble_dir = os.path.join(PATH_TO_TPM, 'assemble')
if not os.path.exists(assemble_dir):
    os.mkdir(assemble_dir)
fname = os.path.join(assemble_dir, 'assemble.%d.txt' % (len(run_ids)))
print(f'Write to {fname}')
with open(fname, 'w') as f:
    # write first line
    lst1 = ['gene_id'] + run_ids_sorted
    f.write('\t'.join(lst1) + '\n')
    for gene in gene_ids:
        lst2 = [gene]
        for run in run_ids_sorted:
            if gene in g[run]:
                lst2.append(g[run][gene])
            else:
                lst2.append('NA')
        f.write('\t'.join(lst2) + '\n')
        assert len(lst1) == len(lst2)