diff options
author | Lan Hui <lanhui@zjnu.edu.cn> | 2024-08-17 10:11:30 +0800 |
---|---|---|
committer | Lan Hui <lanhui@zjnu.edu.cn> | 2024-08-17 10:11:30 +0800 |
commit | f4ce5d7a1b5fa133677219c951ff01d56ce4cb44 (patch) | |
tree | ab05293b0ed138181355b8e886d3febae55edf94 | |
parent | 2f3485a486100d1731229ac018f24cee33734777 (diff) |
merge all TPM files and put the merged file in folder assemble
-rw-r--r-- | Code/mergeTPM.py | 96 |
1 files changed, 96 insertions, 0 deletions
diff --git a/Code/mergeTPM.py b/Code/mergeTPM.py new file mode 100644 index 0000000..b7ae49b --- /dev/null +++ b/Code/mergeTPM.py @@ -0,0 +1,96 @@ +# Purpose: merge all TPM files in PATH_TO_TPM and put the merge file to PATH_TO_TPM/assemble/ +# Usage: python3 mergeTPM.py +# 17 Aug 2024, zjnu, hui + +import os, glob, gzip + +PATH_TO_TPM = '../Data/history/expr/' + +tpm_files_to_include = glob.glob(os.path.join(PATH_TO_TPM, 'TPM*')) +#tpm_files_to_include = ['../Data/history/expr/TPM.inhouse.diurnal.txt', '../Data/history/expr/TPM.20190919.txt'] + +# Get all run IDs +run_ids = set() +for filename in tpm_files_to_include: + print(filename) + if filename.endswith('txt'): + with open(filename) as f: + line = f.readlines()[0] + line = line.strip() + lst = line.split('\t') + for runid in lst[1:]: + run_ids.add(runid) + elif filename.endswith('gz'): + with gzip.open(filename, 'rt') as f: + line = f.readlines()[0] + line = line.strip() + lst = line.split('\t') + for runid in lst[1:]: + run_ids.add(runid) + +print('Total unique run IDs: %d' % len(run_ids)) + +# Get gene IDs +gene_ids = [] +with open(os.path.join(PATH_TO_TPM, 'TPM.txt')) as f: + for line in f: + line = line.strip() + if line.startswith('AT'): + lst = line.split('\t') + gene_ids.append(lst[0]) + +assert len(gene_ids) == 33602 + + +# Assemble gene expressions +g = {} # {'run':{'g1':1, 'g2':2}} +def populate_dictionary(d, lines): + line = lines[0].strip() + runs = line.split('\t')[1:] + for line in lines[1:]: + line = line.strip() + lst = line.split('\t') + gene = lst[0] + expression_levels = lst[1:] + run_index = 0 + for x in expression_levels: + run = runs[run_index] + if not run in d: + d[run] = {} + d[run][gene] = x + run_index += 1 + + +for filename in tpm_files_to_include: + print('Assemble ' + filename) + if filename.endswith('txt'): + with open(filename) as f: + lines = f.readlines() + populate_dictionary(g, lines) + elif filename.endswith('gz'): + with gzip.open(filename, 'rt') as f: + lines = f.readlines() + populate_dictionary(g, lines) + + +# Write to TPM.assemble.number.txt +run_ids_sorted = sorted(list(run_ids)) +assemble_dir = os.path.join(PATH_TO_TPM, 'assemble') +if not os.path.exists(assemble_dir): + os.mkdir(assemble_dir) +fname = os.path.join(assemble_dir, 'assemble.%d.txt' % (len(run_ids))) +print(f'Write to {fname}') +with open(fname, 'w') as f: + # write first line + lst1 = ['gene_id'] + run_ids_sorted + f.write('\t'.join(lst1) + '\n') + for gene in gene_ids: + lst2 = [gene] + for run in run_ids_sorted: + if gene in g[run]: + lst2.append(g[run][gene]) + else: + lst2.append('NA') + f.write('\t'.join(lst2) + '\n') + assert len(lst1) == len(lst2) + |