From 21236c3787200eb112d07618af683cf63ef60b24 Mon Sep 17 00:00:00 2001
From: Hui Lan <lanhui@zjnu.edu.cn>
Date: Thu, 8 Aug 2024 17:18:28 +0800
Subject: Improve memory efficency while building TPM.txt

---
 Code/buildRmatrix.py       | 93 +++++++++++++++++-----------------------------
 Code/test_append_column.py |  3 ++
 Code/utils.py              | 62 +++++++++++++++++++++++++++++++
 3 files changed, 99 insertions(+), 59 deletions(-)
 create mode 100644 Code/test_append_column.py

(limited to 'Code')

diff --git a/Code/buildRmatrix.py b/Code/buildRmatrix.py
index 1c90eb0..bb5fdfb 100644
--- a/Code/buildRmatrix.py
+++ b/Code/buildRmatrix.py
@@ -1,5 +1,4 @@
 # Usage: python buildRmatrix.py paramter_for_buildRmatrix.txt
-#        Edit the variable TPM_TABLE for a different output file name.
 #        Watch out NA values in TPM.txt, these genes don't have any gene expression information.
 #
 # Purpose: make a TPM table, where each row is a gene, and each column is an experiment.  The column name is RNA-seq experiment ID.
@@ -11,9 +10,9 @@
 
 import os, sys, glob
 from configure import TPM_FILE
+from utils import write_first_column, append_column_fast
 
-TPM_TABLE = TPM_FILE
-WARN_NA   = False
+WARN_NA = False
 
 ####################################
 GLB_PARAM_SYMBOL    = '%%'
@@ -88,63 +87,28 @@ def get_max_expressed_isoform_save_space(g, d):
     return '%4.2f' % max(lst)
 
 
-def save_TPM_table(gene_lst, dict_lst, fname):
+def make_gene_expression_list(gene_lst, d):
     '''
     gene_lst: a list of genes
-    dict_lst: a list of dictionaries.  Each dictionary contains gene expression inforamtion.  What is the detailed data structure of each dictionary?
-    fname: where the gene expression level matrix will be saved.
+    dict: each dictionary contains gene expression inforamtion.
     '''
-    
-    dir_name = os.path.dirname(fname)
-    if not os.path.isdir(dir_name):
-        os.makedirs(dir_name)
-        
-    if len(dict_lst) == 0:
-        print('buildRmatrix.py: dict_lst is empty. Nothing to build.')
-        sys.exit()
-
-    f = open(fname, 'w')
-    head = 'gene_id'
-    #print('Merge %d tables.' % (len(dict_lst)))
-    for d in dict_lst:
-        head += '\t' + d['ID'] # d['ID'] is the RNA-seq samples's SRA id
-    f.write('%s\n' % (head))
-    total_count = 0 # number of total gene expression levels
+    result = []
     bad_count = 0   # number of NA gene expression levels.  We wish this number to be far smaller than total_count.
 
-    missed_genes = {}
     for g in gene_lst:
-        s = g
-        for d in dict_lst:
-            if g in d['isoform']:
-                v = d['isoform'][g]
-            else:
-                v = '-9'
-            total_count += 1
-            if v != '-9':
-                s += '\t' + v
-            else:
-                if WARN_NA:
-                    print('WARNING [buildRmatrix.py]: %s not in %s.' % (g, d['ID']))
-                s += '\t' + 'NA'
-                bad_count += 1
-                missed_genes[g] = 1
-        f.write('%s\n' % (s))
-    f.close()
-
-    if 1.0 * bad_count / total_count > 0.0: 
-        print('WARNING [buildRmatrix.py]: %s contains NA values!\n%d out of %d gene expression levels (%4.1f percent) are NAs.\n%d gene IDs are in your gene list but not in the results output by Salmon.' % (fname, bad_count, total_count, 100.0* bad_count/total_count, len(missed_genes)))
-
+        v = 'NA'
+        if g in d['isoform']:
+            v = d['isoform'][g]
+        else:
+            bad_count += 1
+            if WARN_NA:
+                print('WARNING [buildRmatrix.py]: %s not in %s.' % (g, d['ID']))
+        result.append(v)
 
-def get_dict_list(d):
-    ''' A list of dictionaries, each element for one RNA-seq data '''
-    dlst = []
-    for myid in d['ID_LIST']:
-        if myid in d:
-            fname = d[myid]['LOCATION']
-            d2 = make_expression_dict(fname, myid)
-            dlst.append(d2)
-    return dlst
+    total_count = len(gene_lst)
+    if 1.0 * bad_count / total_count > 0.0 and WARN_NA: 
+        print('WARNING [buildRmatrix.py]: %s contains NA values! \n%d out of %d gene expression levels (%4.1f percent) are NAs.\n%d gene IDs are in your gene list but not in the results output by Salmon.' % (d['ID'], bad_count, total_count, 100.0* bad_count/total_count, bad_count))
+    return result
 
 
 def get_gene_list(fname):
@@ -179,7 +143,7 @@ def make_data_dict(fname):
     Return a dictionary which looks like
     
     {
-      'ID_LIST': [],
+      'RNASEQ_ID_LIST': [],
       'SRR1':
         {
           'LOCATION': path to the salmon quant file, e.g., /home/lanhui/brain/Data/R/Mapped/public/SRR953400_quant.txt
@@ -187,7 +151,7 @@ def make_data_dict(fname):
     }
 
     '''
-    d = {'ID_LIST':[]} # ID_LIST is a list of RNA-seq experiment IDs
+    d = {'RNASEQ_ID_LIST':[]} # RNASEQ_ID_LIST is a list of RNA-seq experiment IDs
     f = open(fname)
     lines = f.readlines()
     f.close()
@@ -202,7 +166,7 @@ def make_data_dict(fname):
                 print('Warning [buildRmatrix.py]: ID %s is duplicated.' % (s))
                 sys.exit()
             d[s] = {'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''}
-            d['ID_LIST'].append(s)
+            d['RNASEQ_ID_LIST'].append(s)
         if line.startswith('DESCRIPTION:'):
             d[s]['DESCRIPTION'] = get_value(line, ':')
         elif line.startswith('DATA_FORMAT:'):
@@ -234,10 +198,21 @@ def make_global_param_dict(fname):
     f.close()
     return d
 
+
+
 ## main
 param_file = sys.argv[1]
 global_param_dict = make_global_param_dict(param_file)
+gene_lst = get_gene_list(global_param_dict['GENE_LIST'])
 data_dict = make_data_dict(param_file)
-TPM_TABLE = os.path.abspath(TPM_TABLE)
-save_TPM_table(get_gene_list(global_param_dict['GENE_LIST']), get_dict_list(data_dict), TPM_TABLE)
-
+TPM_FILE = os.path.abspath(TPM_FILE)
+write_first_column(['gene_id'] + gene_lst, TPM_FILE)
+num_rnaseq = len(data_dict['RNASEQ_ID_LIST'])
+print('[buildRmatrix.py] Total RNA-seq data %d' % num_rnaseq)
+total = 0
+for experiment_id in data_dict['RNASEQ_ID_LIST']:
+    total += 1
+    print('\rNow appending %s (%4.2f%% finished)' % (experiment_id, 100*total/num_rnaseq), end='')
+    d = make_expression_dict(data_dict[experiment_id]['LOCATION'], experiment_id)
+    gene_expression_lst = make_gene_expression_list(gene_lst, d)
+    result = append_column_fast(TPM_FILE, [experiment_id] + gene_expression_lst)
diff --git a/Code/test_append_column.py b/Code/test_append_column.py
new file mode 100644
index 0000000..77d54cc
--- /dev/null
+++ b/Code/test_append_column.py
@@ -0,0 +1,3 @@
+from utils import append_column_fast2
+result = append_column_fast2('../Data/temp/1.txt', ['4','5','6'])
+print(result)
diff --git a/Code/utils.py b/Code/utils.py
index 2451b91..d4bb3e6 100644
--- a/Code/utils.py
+++ b/Code/utils.py
@@ -3,6 +3,8 @@
 # Created by Hui on 20 July 2021
 
 import os
+import shutil
+import mmap
 from datetime import datetime
 
 def get_edge_set(fname):
@@ -43,6 +45,66 @@ def make_paths(s):
         os.makedirs(s)
 
 
+def write_first_column(lst, fname):
+    with open(fname, 'w') as f:
+        for x in lst:
+            f.write(x + '\n')
+
+
+def append_column_fast(basefile, col):
+    ''' Append col to basefile.  If basefile does not exist, then create it and col will be basefile's first column.'''
+    if not os.path.exists(basefile):
+        with open(basefile, 'w') as f:
+            count = 0
+            for x in col:
+                f.write(x + '\n')
+                count += 1
+            return count
+
+    with open(basefile) as f:
+        lines = f.readlines()
+        if len(lines) != len(col):
+            return
+        
+    count = 0
+    with open(basefile, 'w') as f:
+        for line in lines:
+            line = line.strip()
+            new_line = line + '\t' + col[count] + '\n'
+            count += 1
+            f.write(new_line)
+
+    return count
+
+
+def append_column_fast2(basefile, col):
+    ''' Append col to basefile.  If basefile does not exist, then create it and col will be basefile's first column.'''
+    if not os.path.exists(basefile):
+        with open(basefile, 'w') as f:
+            count = 0
+            for x in col:
+                f.write(x + '\n')
+                count += 1
+            return count
+
+    with open(basefile) as f:
+        with mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_f:
+            lines = mmap_f.read().split(b'\n')[:-1]
+            if len(lines) != len(col):
+                return
+        
+    count = 0
+    content = ''
+    with open(basefile, 'w') as f:
+        for line in lines:
+            line = line.decode().strip()
+            new_line = line + '\t' + col[count] + '\n'
+            content += new_line
+            count += 1
+        f.write(content)
+    return count
+
+
 if __name__ == '__main__':
     S2 = get_edge_set('/home/lanhui/brain/Data/temp/edges.txt')
     S1 = get_edge_set('/home/lanhui/brain/Data/temp/edges.txt.old')
-- 
cgit v1.2.1