# Purpose: merge all TPM files in PATH_TO_TPM and put the merged file to PATH_TO_TPM/assemble/ # Usage: python3 mergeTPM.py # 17 Aug 2024, zjnu, hui import os, glob, gzip PATH_TO_TPM = '../Data/history/expr/' tpm_files_to_include = glob.glob(os.path.join(PATH_TO_TPM, 'TPM*')) #tpm_files_to_include = ['../Data/history/expr/TPM.inhouse.diurnal.txt', '../Data/history/expr/TPM.20190919.txt'] # Get all run IDs run_ids = set() for filename in tpm_files_to_include: print(filename) if filename.endswith('txt'): with open(filename) as f: line = f.readlines()[0] line = line.strip() lst = line.split('\t') for runid in lst[1:]: run_ids.add(runid) elif filename.endswith('gz'): with gzip.open(filename, 'rt') as f: line = f.readlines()[0] line = line.strip() lst = line.split('\t') for runid in lst[1:]: run_ids.add(runid) print('Total unique run IDs: %d' % len(run_ids)) # Get gene IDs gene_ids = [] with open(os.path.join(PATH_TO_TPM, 'TPM.txt')) as f: for line in f: line = line.strip() if line.startswith('AT'): lst = line.split('\t') gene_ids.append(lst[0]) assert len(gene_ids) == 33602 # Assemble gene expressions g = {} # {'run':{'g1':1, 'g2':2}} def populate_dictionary(d, lines): line = lines[0].strip() runs = line.split('\t')[1:] for line in lines[1:]: line = line.strip() lst = line.split('\t') gene = lst[0] expression_levels = lst[1:] run_index = 0 for x in expression_levels: run = runs[run_index] if not run in d: d[run] = {} d[run][gene] = x run_index += 1 for filename in tpm_files_to_include: print('Assemble ' + filename) if filename.endswith('txt'): with open(filename) as f: lines = f.readlines() populate_dictionary(g, lines) elif filename.endswith('gz'): with gzip.open(filename, 'rt') as f: lines = f.readlines() populate_dictionary(g, lines) # Write to TPM.assemble.number.txt run_ids_sorted = sorted(list(run_ids)) assemble_dir = os.path.join(PATH_TO_TPM, 'assemble') if not os.path.exists(assemble_dir): os.mkdir(assemble_dir) fname = os.path.join(assemble_dir, 'assemble.%d.txt' % (len(run_ids))) print(f'Write to {fname}') with open(fname, 'w') as f: # write first line lst1 = ['gene_id'] + run_ids_sorted f.write('\t'.join(lst1) + '\n') for gene in gene_ids: lst2 = [gene] for run in run_ids_sorted: if gene in g[run]: lst2.append(g[run][gene]) else: lst2.append('NA') f.write('\t'.join(lst2) + '\n') assert len(lst1) == len(lst2)