From 823db708ed6b37a44a970d5f5a3e9c765aa83c16 Mon Sep 17 00:00:00 2001
From: Hui Lan <lanhui@zjnu.edu.cn>
Date: Wed, 10 Feb 2021 14:26:09 +0800
Subject: download_and_map.py: adapt this script to use the latest
 rnaseq_info_database.json file.

---
 Code/download_and_map.py | 11 ++++++-----
 Code/test.py             | 16 ++++++++++++++++
 2 files changed, 22 insertions(+), 5 deletions(-)
 create mode 100644 Code/test.py

diff --git a/Code/download_and_map.py b/Code/download_and_map.py
index b32b4e5..f23e241 100644
--- a/Code/download_and_map.py
+++ b/Code/download_and_map.py
@@ -13,6 +13,7 @@
 # 23 DEC 2016, hui, slcu. Updated: 9 Feb 2017
 # Last modified 10 APR 2017, hui, slcu
 # Last reviewed 31 July 2018
+# Last revised 10 Feb 2021
 
 import os, sys, glob, json
 import fnmatch
@@ -75,7 +76,7 @@ def get_list(fname):
     return result # only return unique elements
 
 
-def make_download_list(all_run_ids, mapped_dir, rna_data_info_dict):
+def make_download_list(mapped_dir, rna_data_info_dict):
     ''' 
     Make next n sample IDs.  These samples must have not been downloaded yet.  
 
@@ -88,9 +89,9 @@ def make_download_list(all_run_ids, mapped_dir, rna_data_info_dict):
     mapped_files = glob_files(mapped_dir, '*_quant.txt')
     mapped_run_ids = get_list(DOWNLOADED_SRA_ID_LOG_FILE)
     small_ids = get_list(IGNORED_SRA_ID_LOG_FILE) # these files are too small
-    for x in sorted(all_run_ids, reverse=True): # SRR first, then ERR, then DRR
-        include_me = True if x in rna_data_info_dict and rna_data_info_dict[x] > 0 else False
-        if not (x + '_quant.txt') in mapped_files and not x in result and (not x in small_ids) and (not x in mapped_run_ids) and include_me: # not mapped yet and is RNA-seq
-            result.append(x)
+    for run_id in sorted(rna_data_info_dict, reverse=True): # SRR first, then ERR, then DRR
+        include_me = rna_data_info_dict[run_id]['library_strategy'].lower() == 'rna-seq' and rna_data_info_dict[run_id]['library_source'].lower() == 'transcriptomic' # keep transcriptomic RNA-seq runs only
+        if not (run_id + '_quant.txt') in mapped_files and (not run_id in result) and (not run_id in small_ids) and (not run_id in mapped_run_ids) and include_me: # not mapped yet and is RNA-seq
+            result.append(run_id)
     return result
 
@@ -373,7 +374,7 @@ if len(sys.argv) > 1:  # user has provided a list of IDs in a file
     DAILY_MAP_NUMBER = len(download_list)
 else:
     print('[download_and_map.py] Prepare download list ...')
-    download_list = make_download_list(rna_data_info_dict.keys(), MAPPED_RDATA_DIR, rna_data_info_dict)
+    download_list = make_download_list(MAPPED_RDATA_DIR, rna_data_info_dict)
     print('[download_and_map.py] There are %d run IDs from which you could select %d of them.' % (len(download_list), DAILY_MAP_NUMBER))
 
 
diff --git a/Code/test.py b/Code/test.py
new file mode 100644
index 0000000..0ddae43
--- /dev/null
+++ b/Code/test.py
@@ -0,0 +1,16 @@
+import json
+
+def read_ena_data_info_json(fname):
+    ''' Return {run_id: record} for the runs in JSON file fname that are transcriptomic RNA-seq. '''
+    with open(fname) as json_data:
+        json_dict = json.load(json_data)
+    return {
+        run_id: info for run_id, info in json_dict.items()
+        if info['library_strategy'].lower() == 'rna-seq' and info['library_source'].lower() == 'transcriptomic'
+    }
+
+
+d = read_ena_data_info_json('../Data/information/rnaseq_info_database.json.temp')
+for info in d.values():  # one tab-separated line per run: library strategy, library source
+    print('%s\t%s' % (info['library_strategy'], info['library_source']))
+
-- 
cgit v1.2.1