summaryrefslogtreecommitdiff
path: root/Code/make_parameter_rnaseq.py
diff options
context:
space:
mode:
authorHui Lan <lanhui@zjnu.edu.cn>2019-12-04 19:03:19 +0800
committerHui Lan <lanhui@zjnu.edu.cn>2019-12-04 19:03:19 +0800
commit97fdefab064f63642fa3ece05b807d29b459df31 (patch)
treea058530023224f3e35b1783996f3530c80c04bc5 /Code/make_parameter_rnaseq.py
brain: add python and R code to local repository.
Diffstat (limited to 'Code/make_parameter_rnaseq.py')
-rw-r--r--Code/make_parameter_rnaseq.py163
1 files changed, 163 insertions, 0 deletions
diff --git a/Code/make_parameter_rnaseq.py b/Code/make_parameter_rnaseq.py
new file mode 100644
index 0000000..1fe9c6e
--- /dev/null
+++ b/Code/make_parameter_rnaseq.py
@@ -0,0 +1,163 @@
+# Usage: python make_parameter_rnaseq.py [id-list.txt] > parameter_for_buildRmatrix.txt
+# Edit QUANT_PATH, set NON_ZERO_RATIO.
+#
+# Purpose: automatically generate parameter_for_buildRmatrix.txt
+#
+# Update: 26 Feb 2017, slcu, hui
+# Update: 19 Sep 2019, slcu, hui [add read_ena_data_info_json]
+
+import sys, os, glob, json
+import fnmatch, re
+from configure import RNA_SEQ_INFO_FILE
+
+NON_ZERO_RATIO = 0.2 # omit *_quant.txt files with too many zeros.
+QUANT_PATH = ['../Data/R/Mapped/public', '../Data/R/Mapped/inhouse', '../Data/R/Mapped/other'] # places where all _quant.txt reside. _quant.txt in sub-directories will also be used.
+
+def get_quant_file_list(fname):
+ f = open(fname)
+ result = []
+ for line in f:
+ line = line.strip()
+ lst = line.split()
+ result.append(lst[0])
+ f.close()
+ return result
+
+def extract_id(s, src):
+ if src == 'inhouse' or src == 'other':
+ dirname = os.path.dirname(s)
+ lst = dirname.split('/')
+ parent_dir = lst[-1]
+ libno = filter(str.isdigit, parent_dir) # extrac library number
+ libno = libno.zfill(3)
+ sample_no = os.path.basename(s) # extract sample number
+ first_match = re.findall('_S\d+_', sample_no)[0]
+ sample_no = filter(str.isdigit, first_match).zfill(2)
+ return '0000' + libno + sample_no
+ if src == 'sra':
+ index = s.find('_quant')
+ if index > 0:
+ return s[:index]
+ else:
+ return 'NA'
+ return 'NA'
+
+def zfill2(s, n):
+ return s + 'X' * (n-len(s))
+
+def make_id(success_flag, lab_id, myid): # should also work for non-SRA id
+ if lab_id == 'SRR' or lab_id == 'ERR' or lab_id == 'DRR':
+ result = 'R' + success_flag + lab_id + zfill2(myid, 9)
+ else: # inhouse or other
+ result = 'R' + success_flag + lab_id + myid
+ return result
+
+def glob_files_include_path(directory, pattern):
+ ''' return all file names (with paths) given directory and pattern '''
+ result = []
+ for root, dirnames, filenames in os.walk(directory):
+ for filename in fnmatch.filter(filenames, pattern):
+ result.append(os.path.join(root, filename))
+ return result
+
+def glob_files_include_paths(directory_lst, pattern):
+ ''' return all file names (with paths) given directory and pattern '''
+ result = []
+ for directory in directory_lst:
+ result.extend(glob_files_include_path(os.path.abspath(directory), pattern))
+ return result
+
+def non_zero_ratio(fname):
+ non_zero_count = 0
+ total_count = 0
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines[1:]:
+ line = line.strip()
+ lst = line.split()
+ tpm = lst[3]
+ if not tpm == '0' and not 'nan' in tpm:
+ non_zero_count += 1
+ total_count += 1
+ return 1.0 * non_zero_count / total_count
+
+def get_rna_word_count(s):
+ ''' If s looks like description of an RNA-seq experiment, return 1 otherwise return 0. '''
+ count = 0
+ s = s.lower()
+ if 'rna-seq' in s or 'rnaseq' in s or 'transcriptome' in s or 'transcript' in s or 'mrna' in s:
+ count = 1
+ return count
+
+def read_ena_data_info_json(fname):
+ d = {}
+ with open(fname) as json_data:
+ json_dict = json.load(json_data)
+ for run_id in json_dict:
+ d[run_id] = 1
+ return d
+
+### main
+if not os.path.exists(RNA_SEQ_INFO_FILE):
+ print('make_parameter_rnaseq.py: you must provide %s. See parse_ena_xml.py on how to make it.' % (RNA_SEQ_INFO_FILE))
+ sys.exit()
+
+rna_data_info_dict = read_ena_data_info_json(RNA_SEQ_INFO_FILE)
+
+
+print('%%GENE_LIST=../Data/information/gene-list.txt\n%%HOLDON=NO\n') # head
+
+if len(sys.argv) > 1: # only get these samples specified in id-list.txt
+ quant_files = get_quant_file_list(sys.argv[1])
+else:
+ quant_files = glob_files_include_paths(QUANT_PATH, '*_quant.txt')
+
+
+include_count = 0
+total = 0
+already_added_dict = {}
+for fn in sorted(quant_files):
+ total += 1
+
+ nzr = non_zero_ratio(fn)
+ if nzr > NON_ZERO_RATIO: # files with too many zeros are ignored
+ fn2 = os.path.basename(fn)
+ data_name = 'None'
+ if 'inhouse' in fn:
+ myid = extract_id(fn, 'inhouse')
+ myid2 = make_id('0','001', myid) # lab id is 001, phil's lab
+ data_name = 'LAB001' + '_LIB' + myid[-5:]
+ desc = 'in-house'
+ include_me = True
+ elif 'public' in fn:
+ myid = extract_id(fn2, 'sra')
+ myid2 = make_id('0',myid[0:3], myid[3:])
+ data_name = myid
+ desc = 'SRA'
+ include_me = True if myid in rna_data_info_dict and rna_data_info_dict[myid] >= 0 else False # IMPORTANT
+ elif 'other' in fn:
+ myid = extract_id(fn, 'other')
+ # get lab id
+ dirname = os.path.dirname(fn)
+ lst = dirname.split('/')
+ lab_id = lst[-2]
+ myid2 = make_id('0', lab_id, myid) # lab id is 001, phil's lab
+ data_name = 'LAB' + lab_id + '_LIB' + myid[-5:]
+ desc = 'Other lab'
+ include_me = True
+ if include_me and not myid2 in already_added_dict:
+ print('@%s' % (myid2))
+ print('DATA_NAME:%s' % (data_name))
+ print('DATA_FORMAT:%s'% ('txt'))
+ print('DESCRIPTION:%s' % (desc))
+ print('LOCATION:%s' % (fn))
+ print('NOTE: non zero ratio is %4.2f' % (nzr))
+ print('')
+ include_count += 1
+ already_added_dict[myid2] = 'yes'
+ else:
+ #print('%s has too many zeroes. ignore.' % (fn))
+ pass
+
+print('#Done. Processed %d files. Included %d files.' % (total, include_count))