From 21236c3787200eb112d07618af683cf63ef60b24 Mon Sep 17 00:00:00 2001 From: Hui Lan Date: Thu, 8 Aug 2024 17:18:28 +0800 Subject: Improve memory efficency while building TPM.txt --- Code/buildRmatrix.py | 93 +++++++++++++++++----------------------------- Code/test_append_column.py | 3 ++ Code/utils.py | 62 +++++++++++++++++++++++++++++++ 3 files changed, 99 insertions(+), 59 deletions(-) create mode 100644 Code/test_append_column.py (limited to 'Code') diff --git a/Code/buildRmatrix.py b/Code/buildRmatrix.py index 1c90eb0..bb5fdfb 100644 --- a/Code/buildRmatrix.py +++ b/Code/buildRmatrix.py @@ -1,5 +1,4 @@ # Usage: python buildRmatrix.py paramter_for_buildRmatrix.txt -# Edit the variable TPM_TABLE for a different output file name. # Watch out NA values in TPM.txt, these genes don't have any gene expression information. # # Purpose: make a TPM table, where each row is a gene, and each column is an experiment. The column name is RNA-seq experiment ID. @@ -11,9 +10,9 @@ import os, sys, glob from configure import TPM_FILE +from utils import write_first_column, append_column_fast -TPM_TABLE = TPM_FILE -WARN_NA = False +WARN_NA = False #################################### GLB_PARAM_SYMBOL = '%%' @@ -88,63 +87,28 @@ def get_max_expressed_isoform_save_space(g, d): return '%4.2f' % max(lst) -def save_TPM_table(gene_lst, dict_lst, fname): +def make_gene_expression_list(gene_lst, d): ''' gene_lst: a list of genes - dict_lst: a list of dictionaries. Each dictionary contains gene expression inforamtion. What is the detailed data structure of each dictionary? - fname: where the gene expression level matrix will be saved. + dict: each dictionary contains gene expression inforamtion. ''' - - dir_name = os.path.dirname(fname) - if not os.path.isdir(dir_name): - os.makedirs(dir_name) - - if len(dict_lst) == 0: - print('buildRmatrix.py: dict_lst is empty. Nothing to build.') - sys.exit() - - f = open(fname, 'w') - head = 'gene_id' - #print('Merge %d tables.' % (len(dict_lst))) - for d in dict_lst: - head += '\t' + d['ID'] # d['ID'] is the RNA-seq samples's SRA id - f.write('%s\n' % (head)) - total_count = 0 # number of total gene expression levels + result = [] bad_count = 0 # number of NA gene expression levels. We wish this number to be far smaller than total_count. - missed_genes = {} for g in gene_lst: - s = g - for d in dict_lst: - if g in d['isoform']: - v = d['isoform'][g] - else: - v = '-9' - total_count += 1 - if v != '-9': - s += '\t' + v - else: - if WARN_NA: - print('WARNING [buildRmatrix.py]: %s not in %s.' % (g, d['ID'])) - s += '\t' + 'NA' - bad_count += 1 - missed_genes[g] = 1 - f.write('%s\n' % (s)) - f.close() - - if 1.0 * bad_count / total_count > 0.0: - print('WARNING [buildRmatrix.py]: %s contains NA values!\n%d out of %d gene expression levels (%4.1f percent) are NAs.\n%d gene IDs are in your gene list but not in the results output by Salmon.' % (fname, bad_count, total_count, 100.0* bad_count/total_count, len(missed_genes))) - + v = 'NA' + if g in d['isoform']: + v = d['isoform'][g] + else: + bad_count += 1 + if WARN_NA: + print('WARNING [buildRmatrix.py]: %s not in %s.' % (g, d['ID'])) + result.append(v) -def get_dict_list(d): - ''' A list of dictionaries, each element for one RNA-seq data ''' - dlst = [] - for myid in d['ID_LIST']: - if myid in d: - fname = d[myid]['LOCATION'] - d2 = make_expression_dict(fname, myid) - dlst.append(d2) - return dlst + total_count = len(gene_lst) + if 1.0 * bad_count / total_count > 0.0 and WARN_NA: + print('WARNING [buildRmatrix.py]: %s contains NA values! \n%d out of %d gene expression levels (%4.1f percent) are NAs.\n%d gene IDs are in your gene list but not in the results output by Salmon.' % (d['ID'], bad_count, total_count, 100.0* bad_count/total_count, bad_count)) + return result def get_gene_list(fname): @@ -179,7 +143,7 @@ def make_data_dict(fname): Return a dictionary which looks like { - 'ID_LIST': [], + 'RNASEQ_ID_LIST': [], 'SRR1': { 'LOCATION': path to the salmon quant file, e.g., /home/lanhui/brain/Data/R/Mapped/public/SRR953400_quant.txt @@ -187,7 +151,7 @@ def make_data_dict(fname): } ''' - d = {'ID_LIST':[]} # ID_LIST is a list of RNA-seq experiment IDs + d = {'RNASEQ_ID_LIST':[]} # RNASEQ_ID_LIST is a list of RNA-seq experiment IDs f = open(fname) lines = f.readlines() f.close() @@ -202,7 +166,7 @@ def make_data_dict(fname): print('Warning [buildRmatrix.py]: ID %s is duplicated.' % (s)) sys.exit() d[s] = {'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''} - d['ID_LIST'].append(s) + d['RNASEQ_ID_LIST'].append(s) if line.startswith('DESCRIPTION:'): d[s]['DESCRIPTION'] = get_value(line, ':') elif line.startswith('DATA_FORMAT:'): @@ -234,10 +198,21 @@ def make_global_param_dict(fname): f.close() return d + + ## main param_file = sys.argv[1] global_param_dict = make_global_param_dict(param_file) +gene_lst = get_gene_list(global_param_dict['GENE_LIST']) data_dict = make_data_dict(param_file) -TPM_TABLE = os.path.abspath(TPM_TABLE) -save_TPM_table(get_gene_list(global_param_dict['GENE_LIST']), get_dict_list(data_dict), TPM_TABLE) - +TPM_FILE = os.path.abspath(TPM_FILE) +write_first_column(['gene_id'] + gene_lst, TPM_FILE) +num_rnaseq = len(data_dict['RNASEQ_ID_LIST']) +print('[buildRmatrix.py] Total RNA-seq data %d' % num_rnaseq) +total = 0 +for experiment_id in data_dict['RNASEQ_ID_LIST']: + total += 1 + print('\rNow appending %s (%4.2f%% finished)' % (experiment_id, 100*total/num_rnaseq), end='') + d = make_expression_dict(data_dict[experiment_id]['LOCATION'], experiment_id) + gene_expression_lst = make_gene_expression_list(gene_lst, d) + result = append_column_fast(TPM_FILE, [experiment_id] + gene_expression_lst) diff --git a/Code/test_append_column.py b/Code/test_append_column.py new file mode 100644 index 0000000..77d54cc --- /dev/null +++ b/Code/test_append_column.py @@ -0,0 +1,3 @@ +from utils import append_column_fast2 +result = append_column_fast2('../Data/temp/1.txt', ['4','5','6']) +print(result) diff --git a/Code/utils.py b/Code/utils.py index 2451b91..d4bb3e6 100644 --- a/Code/utils.py +++ b/Code/utils.py @@ -3,6 +3,8 @@ # Created by Hui on 20 July 2021 import os +import shutil +import mmap from datetime import datetime def get_edge_set(fname): @@ -43,6 +45,66 @@ def make_paths(s): os.makedirs(s) +def write_first_column(lst, fname): + with open(fname, 'w') as f: + for x in lst: + f.write(x + '\n') + + +def append_column_fast(basefile, col): + ''' Append col to basefile. If basefile does not exist, then create it and col will be basefile's first column.''' + if not os.path.exists(basefile): + with open(basefile, 'w') as f: + count = 0 + for x in col: + f.write(x + '\n') + count += 1 + return count + + with open(basefile) as f: + lines = f.readlines() + if len(lines) != len(col): + return + + count = 0 + with open(basefile, 'w') as f: + for line in lines: + line = line.strip() + new_line = line + '\t' + col[count] + '\n' + count += 1 + f.write(new_line) + + return count + + +def append_column_fast2(basefile, col): + ''' Append col to basefile. If basefile does not exist, then create it and col will be basefile's first column.''' + if not os.path.exists(basefile): + with open(basefile, 'w') as f: + count = 0 + for x in col: + f.write(x + '\n') + count += 1 + return count + + with open(basefile) as f: + with mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_f: + lines = mmap_f.read().split(b'\n')[:-1] + if len(lines) != len(col): + return + + count = 0 + content = '' + with open(basefile, 'w') as f: + for line in lines: + line = line.decode().strip() + new_line = line + '\t' + col[count] + '\n' + content += new_line + count += 1 + f.write(content) + return count + + if __name__ == '__main__': S2 = get_edge_set('/home/lanhui/brain/Data/temp/edges.txt') S1 = get_edge_set('/home/lanhui/brain/Data/temp/edges.txt.old') -- cgit v1.2.1