From d5fd834de918f248107593b73e42af380886cb2d Mon Sep 17 00:00:00 2001 From: Hui Lan Date: Sat, 10 Oct 2020 19:39:25 +0800 Subject: buildRmatrix.py: use string instead of float to store TPM values. This script could run out of memory space (7GB) when there are more than 1000 RNA-seq samples. --- Code/buildRmatrix.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'Code/buildRmatrix.py') diff --git a/Code/buildRmatrix.py b/Code/buildRmatrix.py index 0dfc569..c671f2a 100644 --- a/Code/buildRmatrix.py +++ b/Code/buildRmatrix.py @@ -5,8 +5,9 @@ # Purpose: make a TPM table, where each row is a gene, and each column is an experiment. The column name is RNA-seq experiment ID. # # 23 Dec 2016, hui, slcu -# Last modified 5 Apr 2017, hui, slcu +# Last modified 5 Apr 2017, hui, slcu # Last modified 25 Oct 2019, hui, zjnu [Comments; add a variable WARN_NA to turn on/off print NA warnings.] +# Last modified 10 Oct 2020, hui, zjnu [note that if there are more than 1000 RNA-seq samples, this script requires at least 7GB memory to run.] import os, sys, glob @@ -71,19 +72,19 @@ def make_expression_dict(fname, myid): else: d['isoform'][common].append(tpm) - # make the dictionary smaller, so it requires less memory. Cut from 7.4G to 6.9G for 930 TPM files. + # make the dictionary smaller by using a string instead of a double-precision float number, so it requires less memory. Cut from 7.44G to 6.5G for 1003 TPM files. 
for g in d['isoform']: - d['isoform'][g] = [get_max_expressed_isoform(g, d)] + d['isoform'][g] = '%4.2f' % get_max_expressed_isoform(g, d) return d -def get_max_expressed_isoform(g, d): +def get_max_expressed_isoform_save_space(g, d): + ''' Evolved from get_max_expressed_isoform(g, d) ''' if not g in d['isoform']: - return -9 - lst = d['isoform'][g] - return max(lst) - + return '-9' + return d['isoform'][g] + def save_TPM_table(gene_lst, dict_lst, fname): ''' @@ -113,10 +114,10 @@ def save_TPM_table(gene_lst, dict_lst, fname): for g in gene_lst: s = g for d in dict_lst: - v = get_max_expressed_isoform(g, d) + v = get_max_expressed_isoform_save_space(g, d) total_count += 1 - if v != -9: - s += '\t' + '%4.2f' % (v) + if v != '-9': + s += '\t' + v else: if WARN_NA: print('WARNING [buildRmatrix.py]: %s not in %s.' % (g, d['ID'])) -- cgit v1.2.1