diff options
author | Hui Lan <lanhui@zjnu.edu.cn> | 2024-08-08 17:18:28 +0800 |
---|---|---|
committer | Hui Lan <lanhui@zjnu.edu.cn> | 2024-08-08 17:18:28 +0800 |
commit | 21236c3787200eb112d07618af683cf63ef60b24 (patch) | |
tree | fea27b60a8e7580379252ec36fcb4bd531b98dd9 | |
parent | e5593827fbc6c35e1df6b7ab1131b0717130219c (diff) |
Improve memory efficency while building TPM.txt
-rw-r--r-- | Code/buildRmatrix.py | 93 | ||||
-rw-r--r-- | Code/test_append_column.py | 3 | ||||
-rw-r--r-- | Code/utils.py | 62 |
3 files changed, 99 insertions, 59 deletions
diff --git a/Code/buildRmatrix.py b/Code/buildRmatrix.py index 1c90eb0..bb5fdfb 100644 --- a/Code/buildRmatrix.py +++ b/Code/buildRmatrix.py @@ -1,5 +1,4 @@ # Usage: python buildRmatrix.py paramter_for_buildRmatrix.txt
-# Edit the variable TPM_TABLE for a different output file name.
# Watch out NA values in TPM.txt, these genes don't have any gene expression information.
#
# Purpose: make a TPM table, where each row is a gene, and each column is an experiment. The column name is RNA-seq experiment ID.
@@ -11,9 +10,9 @@ import os, sys, glob
from configure import TPM_FILE
+from utils import write_first_column, append_column_fast
-TPM_TABLE = TPM_FILE
-WARN_NA = False
+WARN_NA = False
####################################
GLB_PARAM_SYMBOL = '%%'
@@ -88,63 +87,28 @@ def get_max_expressed_isoform_save_space(g, d): return '%4.2f' % max(lst)
-def save_TPM_table(gene_lst, dict_lst, fname):
+def make_gene_expression_list(gene_lst, d):
'''
gene_lst: a list of genes
- dict_lst: a list of dictionaries. Each dictionary contains gene expression inforamtion. What is the detailed data structure of each dictionary?
- fname: where the gene expression level matrix will be saved.
+ dict: each dictionary contains gene expression inforamtion.
'''
-
- dir_name = os.path.dirname(fname)
- if not os.path.isdir(dir_name):
- os.makedirs(dir_name)
-
- if len(dict_lst) == 0:
- print('buildRmatrix.py: dict_lst is empty. Nothing to build.')
- sys.exit()
-
- f = open(fname, 'w')
- head = 'gene_id'
- #print('Merge %d tables.' % (len(dict_lst)))
- for d in dict_lst:
- head += '\t' + d['ID'] # d['ID'] is the RNA-seq samples's SRA id
- f.write('%s\n' % (head))
- total_count = 0 # number of total gene expression levels
+ result = []
bad_count = 0 # number of NA gene expression levels. We wish this number to be far smaller than total_count.
- missed_genes = {}
for g in gene_lst:
- s = g
- for d in dict_lst:
- if g in d['isoform']:
- v = d['isoform'][g]
- else:
- v = '-9'
- total_count += 1
- if v != '-9':
- s += '\t' + v
- else:
- if WARN_NA:
- print('WARNING [buildRmatrix.py]: %s not in %s.' % (g, d['ID']))
- s += '\t' + 'NA'
- bad_count += 1
- missed_genes[g] = 1
- f.write('%s\n' % (s))
- f.close()
-
- if 1.0 * bad_count / total_count > 0.0:
- print('WARNING [buildRmatrix.py]: %s contains NA values!\n%d out of %d gene expression levels (%4.1f percent) are NAs.\n%d gene IDs are in your gene list but not in the results output by Salmon.' % (fname, bad_count, total_count, 100.0* bad_count/total_count, len(missed_genes)))
-
+ v = 'NA'
+ if g in d['isoform']:
+ v = d['isoform'][g]
+ else:
+ bad_count += 1
+ if WARN_NA:
+ print('WARNING [buildRmatrix.py]: %s not in %s.' % (g, d['ID']))
+ result.append(v)
-def get_dict_list(d):
- ''' A list of dictionaries, each element for one RNA-seq data '''
- dlst = []
- for myid in d['ID_LIST']:
- if myid in d:
- fname = d[myid]['LOCATION']
- d2 = make_expression_dict(fname, myid)
- dlst.append(d2)
- return dlst
+ total_count = len(gene_lst)
+ if 1.0 * bad_count / total_count > 0.0 and WARN_NA:
+ print('WARNING [buildRmatrix.py]: %s contains NA values! \n%d out of %d gene expression levels (%4.1f percent) are NAs.\n%d gene IDs are in your gene list but not in the results output by Salmon.' % (d['ID'], bad_count, total_count, 100.0* bad_count/total_count, bad_count))
+ return result
def get_gene_list(fname):
@@ -179,7 +143,7 @@ def make_data_dict(fname): Return a dictionary which looks like
{
- 'ID_LIST': [],
+ 'RNASEQ_ID_LIST': [],
'SRR1':
{
'LOCATION': path to the salmon quant file, e.g., /home/lanhui/brain/Data/R/Mapped/public/SRR953400_quant.txt
@@ -187,7 +151,7 @@ def make_data_dict(fname): }
'''
- d = {'ID_LIST':[]} # ID_LIST is a list of RNA-seq experiment IDs
+ d = {'RNASEQ_ID_LIST':[]} # RNASEQ_ID_LIST is a list of RNA-seq experiment IDs
f = open(fname)
lines = f.readlines()
f.close()
@@ -202,7 +166,7 @@ def make_data_dict(fname): print('Warning [buildRmatrix.py]: ID %s is duplicated.' % (s))
sys.exit()
d[s] = {'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''}
- d['ID_LIST'].append(s)
+ d['RNASEQ_ID_LIST'].append(s)
if line.startswith('DESCRIPTION:'):
d[s]['DESCRIPTION'] = get_value(line, ':')
elif line.startswith('DATA_FORMAT:'):
@@ -234,10 +198,21 @@ def make_global_param_dict(fname): f.close()
return d
+
+
## main
param_file = sys.argv[1]
global_param_dict = make_global_param_dict(param_file)
+gene_lst = get_gene_list(global_param_dict['GENE_LIST'])
data_dict = make_data_dict(param_file)
-TPM_TABLE = os.path.abspath(TPM_TABLE)
-save_TPM_table(get_gene_list(global_param_dict['GENE_LIST']), get_dict_list(data_dict), TPM_TABLE)
-
+TPM_FILE = os.path.abspath(TPM_FILE)
+write_first_column(['gene_id'] + gene_lst, TPM_FILE)
+num_rnaseq = len(data_dict['RNASEQ_ID_LIST'])
+print('[buildRmatrix.py] Total RNA-seq data %d' % num_rnaseq)
+total = 0
+for experiment_id in data_dict['RNASEQ_ID_LIST']:
+ total += 1
+ print('\rNow appending %s (%4.2f%% finished)' % (experiment_id, 100*total/num_rnaseq), end='')
+ d = make_expression_dict(data_dict[experiment_id]['LOCATION'], experiment_id)
+ gene_expression_lst = make_gene_expression_list(gene_lst, d)
+ result = append_column_fast(TPM_FILE, [experiment_id] + gene_expression_lst)
diff --git a/Code/test_append_column.py b/Code/test_append_column.py new file mode 100644 index 0000000..77d54cc --- /dev/null +++ b/Code/test_append_column.py @@ -0,0 +1,3 @@ +from utils import append_column_fast2 +result = append_column_fast2('../Data/temp/1.txt', ['4','5','6']) +print(result) diff --git a/Code/utils.py b/Code/utils.py index 2451b91..d4bb3e6 100644 --- a/Code/utils.py +++ b/Code/utils.py @@ -3,6 +3,8 @@ # Created by Hui on 20 July 2021 import os +import shutil +import mmap from datetime import datetime def get_edge_set(fname): @@ -43,6 +45,66 @@ def make_paths(s): os.makedirs(s) +def write_first_column(lst, fname): + with open(fname, 'w') as f: + for x in lst: + f.write(x + '\n') + + +def append_column_fast(basefile, col): + ''' Append col to basefile. If basefile does not exist, then create it and col will be basefile's first column.''' + if not os.path.exists(basefile): + with open(basefile, 'w') as f: + count = 0 + for x in col: + f.write(x + '\n') + count += 1 + return count + + with open(basefile) as f: + lines = f.readlines() + if len(lines) != len(col): + return + + count = 0 + with open(basefile, 'w') as f: + for line in lines: + line = line.strip() + new_line = line + '\t' + col[count] + '\n' + count += 1 + f.write(new_line) + + return count + + +def append_column_fast2(basefile, col): + ''' Append col to basefile. If basefile does not exist, then create it and col will be basefile's first column.''' + if not os.path.exists(basefile): + with open(basefile, 'w') as f: + count = 0 + for x in col: + f.write(x + '\n') + count += 1 + return count + + with open(basefile) as f: + with mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_f: + lines = mmap_f.read().split(b'\n')[:-1] + if len(lines) != len(col): + return + + count = 0 + content = '' + with open(basefile, 'w') as f: + for line in lines: + line = line.decode().strip() + new_line = line + '\t' + col[count] + '\n' + content += new_line + count += 1 + f.write(content) + return count + + if __name__ == '__main__': S2 = get_edge_set('/home/lanhui/brain/Data/temp/edges.txt') S1 = get_edge_set('/home/lanhui/brain/Data/temp/edges.txt.old') |