summaryrefslogtreecommitdiff
path: root/Code/buildRmatrix.py
diff options
context:
space:
mode:
Diffstat (limited to 'Code/buildRmatrix.py')
-rw-r--r--Code/buildRmatrix.py234
1 files changed, 234 insertions, 0 deletions
diff --git a/Code/buildRmatrix.py b/Code/buildRmatrix.py
new file mode 100644
index 0000000..b775c77
--- /dev/null
+++ b/Code/buildRmatrix.py
@@ -0,0 +1,234 @@
+# Usage: python buildRmatrix.py paramter_for_buildRmatrix.txt
+# Edit the variable TPM_TABLE for a different output file name.
+# Watch out NA values in TPM.txt, these genes don't have any gene expression information.
+#
+# Purpose: make a TPM table, where each row is a gene, and each column is an experiment. The column name is RNA-seq experiment ID.
+#
+# 23 Dec 2016, hui, slcu
+# Last modified 5 Apr 2017, hui, slcu
+# Last modified 25 Oct 2019, hui, zjnu [Comments; add a variable WARN_NA to turn on/off print NA warnings.]
+
+import os, sys, glob
+
+#TPM_TABLE = '../Data/history/expr/TPM.txt'
+TPM_TABLE = '../Data/history/expr/TPM.txt'
+WARN_NA = False
+
+####################################
+GLB_PARAM_SYMBOL = '%%'
+LCL_PARAM_SYMBOL = '%'
+DATA_SYMBOL = '@'
+####################################
+
+def common_part(s):
+ ''' s is expected to have this form: AT1G01020.1, remove .1 '''
+ s = s.strip()
+ index = s.find('.')
+ if index < 0: # not found, -1
+ return s
+ return s[0:index]
+
+
+def make_expression_dict(fname, myid):
+ '''
+ fname -- salmon file
+ myid -- RNA-seq experiment ID
+
+ The retured value is a dictionary which looks like
+
+ {
+ 'ID': RNA-seq experiment ID
+ 'isoform':
+ {
+ 'AT1G12345': [],
+ 'AT2G12345': [],
+ ...
+ }
+ }
+
+ Each gene ID (e.g., AT1G12345) has a number of isoforms which gives different expression levels.
+ '''
+
+ ID_COL = 0 # Salmon's quant.sf file, first column is gene ID
+ TPM_COL = 3 # Salmon's quant.sf file, fourth column is TPM
+
+ if not os.path.exists(fname):
+ print('ERROR [buildRmatrix.py]: file %s not exists.' % (fname))
+ sys.exit()
+
+ d = {'ID':myid, 'isoform':{}}
+
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines[1:]: # ignore head line, Name Length EffectiveLength TPM NumReads
+ line = line.strip()
+ lst = line.split()
+ gene_id = lst[ID_COL]
+ tpm = float(lst[TPM_COL])
+ common = common_part(gene_id) # gene id without .1, .2, etc.
+ if not common in d['isoform']:
+ d['isoform'][common] = [tpm]
+ else:
+ d['isoform'][common].append(tpm)
+
+ return d
+
+
+def get_max_expressed_isoform(g, d):
+ if not g in d['isoform']:
+ return -9
+ lst = d['isoform'][g]
+ return max(lst)
+
+
+def save_TPM_table(gene_lst, dict_lst, fname):
+ '''
+ gene_lst: a list of genes
+ dict_lst: a list of dictionaries. Each dictionary contains gene expression inforamtion. What is the detailed data structure of each dictionary?
+ fname: where the gene expression level matrix will be saved.
+ '''
+
+ dir_name = os.path.dirname(fname)
+ if not os.path.isdir(dir_name):
+ os.makedirs(dir_name)
+
+ if len(dict_lst) == 0:
+ print('buildRmatrix.py: dict_lst is empty. Nothing to build.')
+ sys.exit()
+
+ f = open(fname, 'w')
+ head = 'gene_id'
+ #print('Merge %d tables.' % (len(dict_lst)))
+ for d in dict_lst:
+ head += '\t' + d['ID'] # d['ID'] is the RNA-seq samples's SRA id
+ f.write('%s\n' % (head))
+ total_count = 0 # number of total gene expression levels
+ bad_count = 0 # number of NA gene expression levels. We wish this number to be far smaller than total_count.
+
+ missed_genes = {}
+ for g in gene_lst:
+ s = g
+ for d in dict_lst:
+ v = get_max_expressed_isoform(g, d)
+ total_count += 1
+ if v != -9:
+ s += '\t' + '%4.2f' % (v)
+ else:
+ if WARN_NA:
+ print('WARNING [buildRmatrix.py]: %s not in %s.' % (g, d['ID']))
+ s += '\t' + 'NA'
+ bad_count += 1
+ missed_genes[g] = 1
+ f.write('%s\n' % (s))
+ f.close()
+
+ if 1.0 * bad_count / total_count > 0.0:
+ print('WARNING [buildRmatrix.py]: %s contains NA values!\n%d out of %d gene expression levels (%4.1f percent) are NAs.\n%d gene IDs are in your gene list but not in the results output by Salmon.' % (fname, bad_count, total_count, 100.0* bad_count/total_count, len(missed_genes)))
+
+
+def get_dict_list(d):
+ ''' A list of dictionaries, each element for one RNA-seq data '''
+ dlst = []
+ for myid in d['ID_LIST']:
+ if myid in d:
+ fname = d[myid]['LOCATION']
+ d2 = make_expression_dict(fname, myid)
+ dlst.append(d2)
+ return dlst
+
+
+def get_gene_list(fname):
+ f = open(fname)
+ lst = []
+ for line in f:
+ line = line.strip()
+ if line != '':
+ l = line.split()[0]
+ lst.append(l)
+ f.close()
+ return lst
+
+
+def get_key_value(s):
+ lst = s.split('=')
+ k, v = lst[0], lst[1]
+ return (k, v)
+
+
+def get_value(s, delimit):
+ index = s.find(delimit)
+ if index < 0:
+ sys.exit()
+ return s[index+1:].strip()
+
+
+def make_data_dict(fname):
+ '''
+ fname - parameter_for_buildRmatrix.txt
+
+ Return a dictionary which looks like
+
+ {
+ 'ID_LIST': [],
+ 'SRR1':
+ {
+ 'LOCATION': path to the salmon quant file, e.g., /home/lanhui/brain/Data/R/Mapped/public/SRR953400_quant.txt
+ }
+ }
+
+ '''
+ d = {'ID_LIST':[]} # ID_LIST is a list of RNA-seq experiment IDs
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ if line == '' or line.startswith('#'):
+ continue
+ if line.startswith(DATA_SYMBOL):
+ s = line[line.rfind(DATA_SYMBOL[-1])+1:]
+ s = s.strip()
+ if s in d:
+ print('Warning [buildRmatrix.py]: ID %s is duplicated.' % (s))
+ sys.exit()
+ d[s] = {'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''}
+ d['ID_LIST'].append(s)
+ if line.startswith('DESCRIPTION:'):
+ d[s]['DESCRIPTION'] = get_value(line, ':')
+ elif line.startswith('DATA_FORMAT:'):
+ d[s]['DATA_NAME'] = get_value(line, ':')
+ elif line.startswith('DATA_FORMAT:'):
+ d[s]['DATA_FORMAT'] = get_value(line, ':')
+ elif line.startswith('LOCATION:'):
+ d[s]['LOCATION'] = get_value(line, ':')
+ elif line.startswith('NOTE:'):
+ d[s]['NOTE'] = get_value(line, ':')
+ elif line.startswith(LCL_PARAM_SYMBOL) and not line.startswith(GLB_PARAM_SYMBOL):
+ make_local_parameter(d[s]['PARAM'], line)
+
+ return d
+
+
+def make_global_param_dict(fname):
+ f = open(fname)
+ d = {'GENE_LIST':''} # change
+ for line in f:
+ line = line.strip()
+ if line.startswith(GLB_PARAM_SYMBOL):
+ s = line[line.rfind(GLB_PARAM_SYMBOL[-1])+1:]
+ lst = s.split('\t') # separate items by TAB
+ for x in lst:
+ if x != '':
+ k, v = get_key_value(x)
+ d[k] = v
+ f.close()
+ return d
+
+## main
+param_file = sys.argv[1]
+global_param_dict = make_global_param_dict(param_file)
+data_dict = make_data_dict(param_file)
+TPM_TABLE = os.path.abspath(TPM_TABLE)
+save_TPM_table(get_gene_list(global_param_dict['GENE_LIST']), get_dict_list(data_dict), TPM_TABLE)
+#print('Done. Check %s.' % (TPM_TABLE))