# Usage: python make_parameter_rnaseq.py [id-list.txt] > parameter_for_buildRmatrix.txt # Edit QUANT_PATH, set NON_ZERO_RATIO. # # Purpose: automatically generate parameter_for_buildRmatrix.txt # # Update: 26 Feb 2017, slcu, hui # Update: 19 Sep 2019, slcu, hui [add read_ena_data_info_json] import sys, os, glob, json import fnmatch, re from configure import RNA_SEQ_INFO_FILE NON_ZERO_RATIO = 0.2 # omit *_quant.txt files with too many zeros. QUANT_PATH = ['../Data/R/Mapped/public', '../Data/R/Mapped/inhouse', '../Data/R/Mapped/other'] # places where all _quant.txt reside. _quant.txt in sub-directories will also be used. def get_quant_file_list(fname): f = open(fname) result = [] for line in f: line = line.strip() lst = line.split() result.append(lst[0]) f.close() return result def extract_id(s, src): if src == 'inhouse' or src == 'other': dirname = os.path.dirname(s) lst = dirname.split('/') parent_dir = lst[-1] libno = filter(str.isdigit, parent_dir) # extrac library number libno = libno.zfill(3) sample_no = os.path.basename(s) # extract sample number first_match = re.findall('_S\d+_', sample_no)[0] sample_no = filter(str.isdigit, first_match).zfill(2) return '0000' + libno + sample_no if src == 'sra': index = s.find('_quant') if index > 0: return s[:index] else: return 'NA' return 'NA' def zfill2(s, n): return s + 'X' * (n-len(s)) def make_id(success_flag, lab_id, myid): # should also work for non-SRA id if lab_id == 'SRR' or lab_id == 'ERR' or lab_id == 'DRR': result = 'R' + success_flag + lab_id + zfill2(myid, 9) else: # inhouse or other result = 'R' + success_flag + lab_id + myid return result def glob_files_include_path(directory, pattern): ''' return all file names (with paths) given directory and pattern ''' result = [] for root, dirnames, filenames in os.walk(directory): for filename in fnmatch.filter(filenames, pattern): result.append(os.path.join(root, filename)) return result def glob_files_include_paths(directory_lst, pattern): ''' return all file names (with paths) given directory and pattern ''' result = [] for directory in directory_lst: result.extend(glob_files_include_path(os.path.abspath(directory), pattern)) return result def non_zero_ratio(fname): non_zero_count = 0 total_count = 0 f = open(fname) lines = f.readlines() f.close() for line in lines[1:]: line = line.strip() lst = line.split() tpm = lst[3] if not tpm == '0' and not 'nan' in tpm: non_zero_count += 1 total_count += 1 return 1.0 * non_zero_count / total_count def get_rna_word_count(s): ''' If s looks like description of an RNA-seq experiment, return 1 otherwise return 0. ''' count = 0 s = s.lower() if 'rna-seq' in s or 'rnaseq' in s or 'transcriptome' in s or 'transcript' in s or 'mrna' in s: count = 1 return count def read_ena_data_info_json(fname): d = {} with open(fname) as json_data: json_dict = json.load(json_data) for run_id in json_dict: d[run_id] = 1 return d ### main if not os.path.exists(RNA_SEQ_INFO_FILE): print('make_parameter_rnaseq.py: you must provide %s. See parse_ena_xml.py on how to make it.' % (RNA_SEQ_INFO_FILE)) sys.exit() rna_data_info_dict = read_ena_data_info_json(RNA_SEQ_INFO_FILE) print('%%GENE_LIST=../Data/information/gene-list.txt\n%%HOLDON=NO\n') # head if len(sys.argv) > 1: # only get these samples specified in id-list.txt quant_files = get_quant_file_list(sys.argv[1]) else: quant_files = glob_files_include_paths(QUANT_PATH, '*_quant.txt') include_count = 0 total = 0 already_added_dict = {} for fn in sorted(quant_files): total += 1 nzr = non_zero_ratio(fn) if nzr > NON_ZERO_RATIO: # files with too many zeros are ignored fn2 = os.path.basename(fn) data_name = 'None' if 'inhouse' in fn: myid = extract_id(fn, 'inhouse') myid2 = make_id('0','001', myid) # lab id is 001, phil's lab data_name = 'LAB001' + '_LIB' + myid[-5:] desc = 'in-house' include_me = True elif 'public' in fn: myid = extract_id(fn2, 'sra') myid2 = make_id('0',myid[0:3], myid[3:]) data_name = myid desc = 'SRA' include_me = True if myid in rna_data_info_dict and rna_data_info_dict[myid] >= 0 else False # IMPORTANT elif 'other' in fn: myid = extract_id(fn, 'other') # get lab id dirname = os.path.dirname(fn) lst = dirname.split('/') lab_id = lst[-2] myid2 = make_id('0', lab_id, myid) # lab id is 001, phil's lab data_name = 'LAB' + lab_id + '_LIB' + myid[-5:] desc = 'Other lab' include_me = True if include_me and not myid2 in already_added_dict: print('@%s' % (myid2)) print('DATA_NAME:%s' % (data_name)) print('DATA_FORMAT:%s'% ('txt')) print('DESCRIPTION:%s' % (desc)) print('LOCATION:%s' % (fn)) print('NOTE: non zero ratio is %4.2f' % (nzr)) print('') include_count += 1 already_added_dict[myid2] = 'yes' else: #print('%s has too many zeroes. ignore.' % (fn)) pass print('#Done. Processed %d files. Included %d files.' % (total, include_count))