brain: add python and R code to local repository.

author: Hui Lan <lanhui@zjnu.edu.cn> 2019-12-04 19:03:19 +0800
committer: Hui Lan <lanhui@zjnu.edu.cn> 2019-12-04 19:03:19 +0800
commit: 97fdefab064f63642fa3ece05b807d29b459df31 (patch)
tree: a058530023224f3e35b1783996f3530c80c04bc5 /Code/make_parameter_rnaseq.py
1 files changed, 163 insertions, 0 deletions
diff --git a/Code/make_parameter_rnaseq.py b/Code/make_parameter_rnaseq.py
new file mode 100644
index 0000000..1fe9c6e
--- /dev/null
+++ b/Code/make_parameter_rnaseq.py
@@ -0,0 +1,163 @@
+# Usage: python make_parameter_rnaseq.py [id-list.txt] > parameter_for_buildRmatrix.txt
+#        Edit QUANT_PATH, set NON_ZERO_RATIO.
+#
+# Purpose: automatically generate parameter_for_buildRmatrix.txt
+#
+# Update: 26 Feb 2017, slcu, hui
+# Update: 19 Sep 2019, slcu, hui [add  read_ena_data_info_json]
+
+import sys, os, glob, json
+import fnmatch, re
+from configure import RNA_SEQ_INFO_FILE
+
+NON_ZERO_RATIO = 0.2 # omit *_quant.txt files with too many zeros.
+QUANT_PATH = ['../Data/R/Mapped/public', '../Data/R/Mapped/inhouse', '../Data/R/Mapped/other']  # places where all _quant.txt reside. _quant.txt in sub-directories will also be used.
+
+def get_quant_file_list(fname):
+    f = open(fname)
+    result = []
+    for line in f:
+        line = line.strip()
+        lst = line.split()
+        result.append(lst[0])
+    f.close()
+    return result
+
+def extract_id(s, src):
+    if src == 'inhouse' or src == 'other':
+        dirname = os.path.dirname(s)
+        lst = dirname.split('/')
+        parent_dir = lst[-1]
+        libno = filter(str.isdigit, parent_dir) # extrac library number
+        libno = libno.zfill(3)
+        sample_no = os.path.basename(s) # extract sample number
+        first_match = re.findall('_S\d+_', sample_no)[0]
+        sample_no = filter(str.isdigit, first_match).zfill(2)
+        return '0000' + libno + sample_no
+    if src == 'sra':
+        index = s.find('_quant')
+        if index > 0:
+            return s[:index]
+        else:
+            return 'NA'
+    return 'NA'
+
+def zfill2(s, n):
+    return s + 'X' * (n-len(s))
+    
+def make_id(success_flag, lab_id, myid): # should also work for non-SRA id
+    if lab_id == 'SRR' or lab_id == 'ERR' or lab_id == 'DRR':
+        result = 'R' + success_flag + lab_id + zfill2(myid, 9)
+    else: # inhouse or other
+        result = 'R' + success_flag + lab_id + myid
+    return result
+    
+def glob_files_include_path(directory, pattern):
+    ''' return all file names (with paths) given directory and pattern '''
+    result = []
+    for root, dirnames, filenames in os.walk(directory):
+        for filename in fnmatch.filter(filenames, pattern):
+            result.append(os.path.join(root, filename))
+    return result
+
+def glob_files_include_paths(directory_lst, pattern):
+    ''' return all file names (with paths) given directory and pattern '''
+    result = []
+    for directory in directory_lst:
+        result.extend(glob_files_include_path(os.path.abspath(directory), pattern))
+    return result
+
+def non_zero_ratio(fname):
+    non_zero_count = 0
+    total_count = 0
+    f = open(fname)
+    lines = f.readlines()
+    f.close()
+    for line in lines[1:]:
+        line = line.strip()
+        lst = line.split()
+        tpm = lst[3]
+        if not tpm == '0' and not 'nan' in tpm:
+            non_zero_count += 1
+        total_count += 1
+    return 1.0 * non_zero_count / total_count
+
+def get_rna_word_count(s):
+    ''' If s looks like description of an RNA-seq experiment, return 1 otherwise return 0. '''
+    count = 0
+    s = s.lower()
+    if 'rna-seq' in s or 'rnaseq' in s or 'transcriptome' in s or 'transcript' in s or 'mrna' in s:
+        count = 1
+    return count
+
+def read_ena_data_info_json(fname):
+    d = {}
+    with open(fname) as json_data:
+        json_dict = json.load(json_data)
+        for run_id in json_dict:
+           d[run_id] = 1
+    return d
+
+### main
+if not os.path.exists(RNA_SEQ_INFO_FILE):
+    print('make_parameter_rnaseq.py: you must provide %s. See parse_ena_xml.py on how to make it.' % (RNA_SEQ_INFO_FILE))
+    sys.exit()
+    
+rna_data_info_dict = read_ena_data_info_json(RNA_SEQ_INFO_FILE)
+
+
+print('%%GENE_LIST=../Data/information/gene-list.txt\n%%HOLDON=NO\n') # head
+
+if len(sys.argv) > 1: # only get these samples specified in id-list.txt
+    quant_files = get_quant_file_list(sys.argv[1])
+else:
+    quant_files = glob_files_include_paths(QUANT_PATH, '*_quant.txt')
+
+
+include_count = 0
+total = 0
+already_added_dict = {}
+for fn in sorted(quant_files):
+    total += 1
+
+    nzr = non_zero_ratio(fn)
+    if nzr > NON_ZERO_RATIO: # files with too many zeros are ignored
+        fn2 = os.path.basename(fn)
+        data_name = 'None'
+        if 'inhouse' in fn:
+            myid = extract_id(fn, 'inhouse')
+            myid2 = make_id('0','001', myid) # lab id is 001, phil's lab
+            data_name = 'LAB001' + '_LIB' + myid[-5:]
+            desc = 'in-house'
+            include_me = True
+        elif 'public' in fn:
+            myid = extract_id(fn2, 'sra')                
+            myid2 = make_id('0',myid[0:3], myid[3:])
+            data_name = myid
+            desc = 'SRA'
+            include_me = True if myid in rna_data_info_dict and rna_data_info_dict[myid] >= 0 else False # IMPORTANT
+        elif 'other' in fn:
+            myid = extract_id(fn, 'other')
+            # get lab id
+            dirname = os.path.dirname(fn)
+            lst = dirname.split('/')
+            lab_id = lst[-2]
+            myid2 = make_id('0', lab_id, myid) # lab id is 001, phil's lab
+            data_name = 'LAB' + lab_id + '_LIB' + myid[-5:]
+            desc = 'Other lab'
+            include_me = True
+        if include_me and not myid2 in already_added_dict:
+            print('@%s' % (myid2))
+            print('DATA_NAME:%s' % (data_name))
+            print('DATA_FORMAT:%s'% ('txt'))
+            print('DESCRIPTION:%s' % (desc))
+            print('LOCATION:%s' % (fn))
+            print('NOTE: non zero ratio is %4.2f' % (nzr))
+            print('')
+            include_count += 1
+            already_added_dict[myid2] = 'yes'
+    else:
+        #print('%s has too many zeroes. ignore.' % (fn))
+        pass
+
+print('#Done.  Processed %d files.  Included %d files.' % (total, include_count))
author	Hui Lan <lanhui@zjnu.edu.cn>	2019-12-04 19:03:19 +0800
committer	Hui Lan <lanhui@zjnu.edu.cn>	2019-12-04 19:03:19 +0800
commit	97fdefab064f63642fa3ece05b807d29b459df31 (patch)
tree	a058530023224f3e35b1783996f3530c80c04bc5 /Code/make_parameter_rnaseq.py