summary | refs | log | tree | commit | diff
path: root/Code/buildRmatrix.py
diff options
context:
space:
mode:
Diffstat (limited to 'Code/buildRmatrix.py')
-rw-r--r-- Code/buildRmatrix.py 23
1 files changed, 12 insertions, 11 deletions
diff --git a/Code/buildRmatrix.py b/Code/buildRmatrix.py
index 0dfc569..c671f2a 100644
--- a/Code/buildRmatrix.py
+++ b/Code/buildRmatrix.py
@@ -5,8 +5,9 @@
# Purpose: make a TPM table, where each row is a gene, and each column is an experiment. The column name is RNA-seq experiment ID.
#
# 23 Dec 2016, hui, slcu
-# Last modified 5 Apr 2017, hui, slcu
+# Last modified 5 Apr 2017, hui, slcu
# Last modified 25 Oct 2019, hui, zjnu [Comments; add a variable WARN_NA to turn on/off print NA warnings.]
+# Last modified 10 Oct 2020, hui, zjnu [Note: with more than 1000 RNA-seq samples, this script requires at least 7 GB of memory to run.]
import os, sys, glob
@@ -71,19 +72,19 @@ def make_expression_dict(fname, myid):
else:
d['isoform'][common].append(tpm)
- # make the dictionary smaller, so it requires less memory. Cut from 7.4G to 6.9G for 930 TPM files.
+ # make the dictionary smaller by using a string instead of a double-precision float number, so it requires less memory. Cut from 7.44G to 6.5G for 1003 TPM files.
for g in d['isoform']:
- d['isoform'][g] = [get_max_expressed_isoform(g, d)]
+ d['isoform'][g] = '%4.2f' % get_max_expressed_isoform(g, d)
return d
-def get_max_expressed_isoform(g, d):
+def get_max_expressed_isoform_save_space(g, d):
+ ''' Evolved from get_max_expressed_isoform(g, d). Return the value stored for gene g in d['isoform'], or the string '-9' if g is absent. '''
if not g in d['isoform']:
- return -9
- lst = d['isoform'][g]
- return max(lst)
-
+ return '-9'
+ return d['isoform'][g]
+
def save_TPM_table(gene_lst, dict_lst, fname):
'''
@@ -113,10 +114,10 @@ def save_TPM_table(gene_lst, dict_lst, fname):
for g in gene_lst:
s = g
for d in dict_lst:
- v = get_max_expressed_isoform(g, d)
+ v = get_max_expressed_isoform_save_space(g, d)
total_count += 1
- if v != -9:
- s += '\t' + '%4.2f' % (v)
+ if v != '-9':
+ s += '\t' + v
else:
if WARN_NA:
print('WARNING [buildRmatrix.py]: %s not in %s.' % (g, d['ID']))