From 9458009e4944953c1cfbbcb331ffb3dda2c1e4e6 Mon Sep 17 00:00:00 2001
From: Hui Lan
Date: Sun, 13 Dec 2020 10:16:39 +0800
Subject: make_parameter_rnaseq.py: if a line in quant.sf is not complete, ignore this quant.sf.

---
 Code/make_parameter_rnaseq.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/Code/make_parameter_rnaseq.py b/Code/make_parameter_rnaseq.py
index 1fe9c6e..18ef568 100644
--- a/Code/make_parameter_rnaseq.py
+++ b/Code/make_parameter_rnaseq.py
@@ -8,7 +8,8 @@
 
 import sys, os, glob, json
 import fnmatch, re
-from configure import RNA_SEQ_INFO_FILE
+from datetime import datetime
+from configure import RNA_SEQ_INFO_FILE, UPDATE_NETWORK_LOG_FILE
 
 NON_ZERO_RATIO = 0.2 # omit *_quant.txt files with too many zeros.
 QUANT_PATH = ['../Data/R/Mapped/public', '../Data/R/Mapped/inhouse', '../Data/R/Mapped/other'] # places where all _quant.txt reside. _quant.txt in sub-directories will also be used.
@@ -76,6 +77,8 @@ def non_zero_ratio(fname):
     for line in lines[1:]:
         line = line.strip()
         lst = line.split()
+        if len(lst) < 4: # this should not occur. Report error if occurred.
+            return -1
         tpm = lst[3]
         if not tpm == '0' and not 'nan' in tpm:
             non_zero_count += 1
@@ -98,6 +101,16 @@ def read_ena_data_info_json(fname):
         d[run_id] = 1
     return d
 
+
+def write_log_file(s, fname):
+    f = open(fname, 'a')
+    curr_time = datetime.now().strftime('%Y-%m-%d %H:%M')
+    s = '[' + curr_time + ']: ' + s
+    if not '\n' in s:
+        s += '\n'
+    f.write(s)
+    f.close()
+
 ### main
 if not os.path.exists(RNA_SEQ_INFO_FILE):
     print('make_parameter_rnaseq.py: you must provide %s. See parse_ena_xml.py on how to make it.' % (RNA_SEQ_INFO_FILE))
@@ -156,6 +169,8 @@ for fn in sorted(quant_files):
         print('')
         include_count += 1
         already_added_dict[myid2] = 'yes'
+    elif nzr < 0:
+        write_log_file('[make_parameter_rnaseq.py] Warning: incomplete line in file %s' % (fn), UPDATE_NETWORK_LOG_FILE)
     else:
         #print('%s has too many zeroes. ignore.' % (fn))
         pass
-- 
cgit v1.2.1