summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHui Lan <lanhui@zjnu.edu.cn>2024-08-08 17:18:28 +0800
committerHui Lan <lanhui@zjnu.edu.cn>2024-08-08 17:18:28 +0800
commit21236c3787200eb112d07618af683cf63ef60b24 (patch)
treefea27b60a8e7580379252ec36fcb4bd531b98dd9
parente5593827fbc6c35e1df6b7ab1131b0717130219c (diff)
Improve memory efficency while building TPM.txt
-rw-r--r--Code/buildRmatrix.py93
-rw-r--r--Code/test_append_column.py3
-rw-r--r--Code/utils.py62
3 files changed, 99 insertions, 59 deletions
diff --git a/Code/buildRmatrix.py b/Code/buildRmatrix.py
index 1c90eb0..bb5fdfb 100644
--- a/Code/buildRmatrix.py
+++ b/Code/buildRmatrix.py
@@ -1,5 +1,4 @@
# Usage: python buildRmatrix.py paramter_for_buildRmatrix.txt
-# Edit the variable TPM_TABLE for a different output file name.
# Watch out NA values in TPM.txt, these genes don't have any gene expression information.
#
# Purpose: make a TPM table, where each row is a gene, and each column is an experiment. The column name is RNA-seq experiment ID.
@@ -11,9 +10,9 @@
import os, sys, glob
from configure import TPM_FILE
+from utils import write_first_column, append_column_fast
-TPM_TABLE = TPM_FILE
-WARN_NA = False
+WARN_NA = False
####################################
GLB_PARAM_SYMBOL = '%%'
@@ -88,63 +87,28 @@ def get_max_expressed_isoform_save_space(g, d):
return '%4.2f' % max(lst)
-def save_TPM_table(gene_lst, dict_lst, fname):
+def make_gene_expression_list(gene_lst, d):
'''
gene_lst: a list of genes
- dict_lst: a list of dictionaries. Each dictionary contains gene expression inforamtion. What is the detailed data structure of each dictionary?
- fname: where the gene expression level matrix will be saved.
+ dict: each dictionary contains gene expression inforamtion.
'''
-
- dir_name = os.path.dirname(fname)
- if not os.path.isdir(dir_name):
- os.makedirs(dir_name)
-
- if len(dict_lst) == 0:
- print('buildRmatrix.py: dict_lst is empty. Nothing to build.')
- sys.exit()
-
- f = open(fname, 'w')
- head = 'gene_id'
- #print('Merge %d tables.' % (len(dict_lst)))
- for d in dict_lst:
- head += '\t' + d['ID'] # d['ID'] is the RNA-seq samples's SRA id
- f.write('%s\n' % (head))
- total_count = 0 # number of total gene expression levels
+ result = []
bad_count = 0 # number of NA gene expression levels. We wish this number to be far smaller than total_count.
- missed_genes = {}
for g in gene_lst:
- s = g
- for d in dict_lst:
- if g in d['isoform']:
- v = d['isoform'][g]
- else:
- v = '-9'
- total_count += 1
- if v != '-9':
- s += '\t' + v
- else:
- if WARN_NA:
- print('WARNING [buildRmatrix.py]: %s not in %s.' % (g, d['ID']))
- s += '\t' + 'NA'
- bad_count += 1
- missed_genes[g] = 1
- f.write('%s\n' % (s))
- f.close()
-
- if 1.0 * bad_count / total_count > 0.0:
- print('WARNING [buildRmatrix.py]: %s contains NA values!\n%d out of %d gene expression levels (%4.1f percent) are NAs.\n%d gene IDs are in your gene list but not in the results output by Salmon.' % (fname, bad_count, total_count, 100.0* bad_count/total_count, len(missed_genes)))
-
+ v = 'NA'
+ if g in d['isoform']:
+ v = d['isoform'][g]
+ else:
+ bad_count += 1
+ if WARN_NA:
+ print('WARNING [buildRmatrix.py]: %s not in %s.' % (g, d['ID']))
+ result.append(v)
-def get_dict_list(d):
- ''' A list of dictionaries, each element for one RNA-seq data '''
- dlst = []
- for myid in d['ID_LIST']:
- if myid in d:
- fname = d[myid]['LOCATION']
- d2 = make_expression_dict(fname, myid)
- dlst.append(d2)
- return dlst
+ total_count = len(gene_lst)
+ if 1.0 * bad_count / total_count > 0.0 and WARN_NA:
+ print('WARNING [buildRmatrix.py]: %s contains NA values! \n%d out of %d gene expression levels (%4.1f percent) are NAs.\n%d gene IDs are in your gene list but not in the results output by Salmon.' % (d['ID'], bad_count, total_count, 100.0* bad_count/total_count, bad_count))
+ return result
def get_gene_list(fname):
@@ -179,7 +143,7 @@ def make_data_dict(fname):
Return a dictionary which looks like
{
- 'ID_LIST': [],
+ 'RNASEQ_ID_LIST': [],
'SRR1':
{
'LOCATION': path to the salmon quant file, e.g., /home/lanhui/brain/Data/R/Mapped/public/SRR953400_quant.txt
@@ -187,7 +151,7 @@ def make_data_dict(fname):
}
'''
- d = {'ID_LIST':[]} # ID_LIST is a list of RNA-seq experiment IDs
+ d = {'RNASEQ_ID_LIST':[]} # RNASEQ_ID_LIST is a list of RNA-seq experiment IDs
f = open(fname)
lines = f.readlines()
f.close()
@@ -202,7 +166,7 @@ def make_data_dict(fname):
print('Warning [buildRmatrix.py]: ID %s is duplicated.' % (s))
sys.exit()
d[s] = {'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''}
- d['ID_LIST'].append(s)
+ d['RNASEQ_ID_LIST'].append(s)
if line.startswith('DESCRIPTION:'):
d[s]['DESCRIPTION'] = get_value(line, ':')
elif line.startswith('DATA_FORMAT:'):
@@ -234,10 +198,21 @@ def make_global_param_dict(fname):
f.close()
return d
+
+
## main
param_file = sys.argv[1]
global_param_dict = make_global_param_dict(param_file)
+gene_lst = get_gene_list(global_param_dict['GENE_LIST'])
data_dict = make_data_dict(param_file)
-TPM_TABLE = os.path.abspath(TPM_TABLE)
-save_TPM_table(get_gene_list(global_param_dict['GENE_LIST']), get_dict_list(data_dict), TPM_TABLE)
-
+TPM_FILE = os.path.abspath(TPM_FILE)
+write_first_column(['gene_id'] + gene_lst, TPM_FILE)
+num_rnaseq = len(data_dict['RNASEQ_ID_LIST'])
+print('[buildRmatrix.py] Total RNA-seq data %d' % num_rnaseq)
+total = 0
+for experiment_id in data_dict['RNASEQ_ID_LIST']:
+ total += 1
+ print('\rNow appending %s (%4.2f%% finished)' % (experiment_id, 100*total/num_rnaseq), end='')
+ d = make_expression_dict(data_dict[experiment_id]['LOCATION'], experiment_id)
+ gene_expression_lst = make_gene_expression_list(gene_lst, d)
+ result = append_column_fast(TPM_FILE, [experiment_id] + gene_expression_lst)
diff --git a/Code/test_append_column.py b/Code/test_append_column.py
new file mode 100644
index 0000000..77d54cc
--- /dev/null
+++ b/Code/test_append_column.py
@@ -0,0 +1,3 @@
+from utils import append_column_fast2
+result = append_column_fast2('../Data/temp/1.txt', ['4','5','6'])
+print(result)
diff --git a/Code/utils.py b/Code/utils.py
index 2451b91..d4bb3e6 100644
--- a/Code/utils.py
+++ b/Code/utils.py
@@ -3,6 +3,8 @@
# Created by Hui on 20 July 2021
import os
+import shutil
+import mmap
from datetime import datetime
def get_edge_set(fname):
@@ -43,6 +45,66 @@ def make_paths(s):
os.makedirs(s)
+def write_first_column(lst, fname):
+ with open(fname, 'w') as f:
+ for x in lst:
+ f.write(x + '\n')
+
+
+def append_column_fast(basefile, col):
+ ''' Append col to basefile. If basefile does not exist, then create it and col will be basefile's first column.'''
+ if not os.path.exists(basefile):
+ with open(basefile, 'w') as f:
+ count = 0
+ for x in col:
+ f.write(x + '\n')
+ count += 1
+ return count
+
+ with open(basefile) as f:
+ lines = f.readlines()
+ if len(lines) != len(col):
+ return
+
+ count = 0
+ with open(basefile, 'w') as f:
+ for line in lines:
+ line = line.strip()
+ new_line = line + '\t' + col[count] + '\n'
+ count += 1
+ f.write(new_line)
+
+ return count
+
+
+def append_column_fast2(basefile, col):
+ ''' Append col to basefile. If basefile does not exist, then create it and col will be basefile's first column.'''
+ if not os.path.exists(basefile):
+ with open(basefile, 'w') as f:
+ count = 0
+ for x in col:
+ f.write(x + '\n')
+ count += 1
+ return count
+
+ with open(basefile) as f:
+ with mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_f:
+ lines = mmap_f.read().split(b'\n')[:-1]
+ if len(lines) != len(col):
+ return
+
+ count = 0
+ content = ''
+ with open(basefile, 'w') as f:
+ for line in lines:
+ line = line.decode().strip()
+ new_line = line + '\t' + col[count] + '\n'
+ content += new_line
+ count += 1
+ f.write(content)
+ return count
+
+
if __name__ == '__main__':
S2 = get_edge_set('/home/lanhui/brain/Data/temp/edges.txt')
S1 = get_edge_set('/home/lanhui/brain/Data/temp/edges.txt.old')