# Usage: python buildRmatrix.py paramter_for_buildRmatrix.txt # Watch out NA values in TPM.txt, these genes don't have any gene expression information. # # Purpose: make a TPM table, where each row is a gene, and each column is an experiment. The column name is RNA-seq experiment ID. # # 23 Dec 2016, hui, slcu # Last modified 5 Apr 2017, hui, slcu # Last modified 25 Oct 2019, hui, zjnu [Comments; add a variable WARN_NA to turn on/off print NA warnings.] # Last modified 10 Oct 2020, hui, zjnu [note that if there are more than 1000 RNA-seq samples, this script requires at least 7GB memory to run.] import os, sys, glob from configure import TPM_FILE from utils import write_first_column, append_column_fast WARN_NA = False #################################### GLB_PARAM_SYMBOL = '%%' LCL_PARAM_SYMBOL = '%' DATA_SYMBOL = '@' #################################### def common_part(s): ''' s is expected to have this form: AT1G01020.1, remove .1 ''' s = s.strip() index = s.find('.') if index < 0: # not found, -1 return s return s[0:index] def make_expression_dict(fname, myid): ''' fname -- salmon file myid -- RNA-seq experiment ID The retured value is a dictionary which looks like { 'ID': RNA-seq experiment ID 'isoform': { 'AT1G12345': [], 'AT2G12345': [], ... } } Each gene ID (e.g., AT1G12345) has a number of isoforms which gives different expression levels. ''' ID_COL = 0 # Salmon's quant.sf file, first column is gene ID TPM_COL = 3 # Salmon's quant.sf file, fourth column is TPM if not os.path.exists(fname): print('ERROR [buildRmatrix.py]: file %s not exists.' % (fname)) sys.exit() d = {'ID':myid, 'isoform':{}} f = open(fname) lines = f.readlines() f.close() for line in lines[1:]: # ignore head line, Name Length EffectiveLength TPM NumReads line = line.strip() lst = line.split() gene_id = lst[ID_COL] tpm = float(lst[TPM_COL]) common = common_part(gene_id) # gene id without .1, .2, etc. if not common in d['isoform']: d['isoform'][common] = [tpm] else: d['isoform'][common].append(tpm) # make the dictionary smaller by using a string instead of a double-precision float number, so it requires less memory. Cut from 7.44G to 6.5G for 1003 TPM files. for g in d['isoform']: d['isoform'][g] = '%s' % get_max_expressed_isoform_save_space(g, d) return d def get_max_expressed_isoform_save_space(g, d): ''' Evolved from get_max_expressed_isoform(g, d) ''' if not g in d['isoform']: return '-9' lst = d['isoform'][g] return '%4.2f' % max(lst) def make_gene_expression_list(gene_lst, d): ''' gene_lst: a list of genes dict: each dictionary contains gene expression inforamtion. ''' result = [] bad_count = 0 # number of NA gene expression levels. We wish this number to be far smaller than total_count. for g in gene_lst: v = 'NA' if g in d['isoform']: v = d['isoform'][g] else: bad_count += 1 if WARN_NA: print('WARNING [buildRmatrix.py]: %s not in %s.' % (g, d['ID'])) result.append(v) total_count = len(gene_lst) if 1.0 * bad_count / total_count > 0.0 and WARN_NA: print('WARNING [buildRmatrix.py]: %s contains NA values! \n%d out of %d gene expression levels (%4.1f percent) are NAs.\n%d gene IDs are in your gene list but not in the results output by Salmon.' % (d['ID'], bad_count, total_count, 100.0* bad_count/total_count, bad_count)) return result def get_gene_list(fname): f = open(fname) lst = [] for line in f: line = line.strip() if line != '': l = line.split()[0] lst.append(l) f.close() return lst def get_key_value(s): lst = s.split('=') k, v = lst[0], lst[1] return (k, v) def get_value(s, delimit): index = s.find(delimit) if index < 0: sys.exit() return s[index+1:].strip() def make_data_dict(fname): ''' fname - parameter_for_buildRmatrix.txt Return a dictionary which looks like { 'RNASEQ_ID_LIST': [], 'SRR1': { 'LOCATION': path to the salmon quant file, e.g., /home/lanhui/brain/Data/R/Mapped/public/SRR953400_quant.txt } } ''' d = {'RNASEQ_ID_LIST':[]} # RNASEQ_ID_LIST is a list of RNA-seq experiment IDs f = open(fname) lines = f.readlines() f.close() for line in lines: line = line.strip() if line == '' or line.startswith('#'): continue if line.startswith(DATA_SYMBOL): s = line[line.rfind(DATA_SYMBOL[-1])+1:] s = s.strip() if s in d: print('Warning [buildRmatrix.py]: ID %s is duplicated.' % (s)) sys.exit() d[s] = {'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''} d['RNASEQ_ID_LIST'].append(s) if line.startswith('DESCRIPTION:'): d[s]['DESCRIPTION'] = get_value(line, ':') elif line.startswith('DATA_FORMAT:'): d[s]['DATA_NAME'] = get_value(line, ':') elif line.startswith('DATA_FORMAT:'): d[s]['DATA_FORMAT'] = get_value(line, ':') elif line.startswith('LOCATION:'): d[s]['LOCATION'] = get_value(line, ':') elif line.startswith('NOTE:'): d[s]['NOTE'] = get_value(line, ':') elif line.startswith(LCL_PARAM_SYMBOL) and not line.startswith(GLB_PARAM_SYMBOL): make_local_parameter(d[s]['PARAM'], line) return d def make_global_param_dict(fname): f = open(fname) d = {'GENE_LIST':''} # change for line in f: line = line.strip() if line.startswith(GLB_PARAM_SYMBOL): s = line[line.rfind(GLB_PARAM_SYMBOL[-1])+1:] lst = s.split('\t') # separate items by TAB for x in lst: if x != '': k, v = get_key_value(x) d[k] = v f.close() return d ## main param_file = sys.argv[1] global_param_dict = make_global_param_dict(param_file) gene_lst = get_gene_list(global_param_dict['GENE_LIST']) data_dict = make_data_dict(param_file) TPM_FILE = os.path.abspath(TPM_FILE) write_first_column(['gene_id'] + gene_lst, TPM_FILE) num_rnaseq = len(data_dict['RNASEQ_ID_LIST']) print('[buildRmatrix.py] Total RNA-seq data %d' % num_rnaseq) total = 0 for experiment_id in data_dict['RNASEQ_ID_LIST']: total += 1 print('\rNow appending %s (%4.2f%% finished)' % (experiment_id, 100*total/num_rnaseq), end='') d = make_expression_dict(data_dict[experiment_id]['LOCATION'], experiment_id) gene_expression_lst = make_gene_expression_list(gene_lst, d) result = append_column_fast(TPM_FILE, [experiment_id] + gene_expression_lst)