summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- Code/download_and_map.py 11
-rw-r--r-- Code/test.py 16
2 files changed, 22 insertions, 5 deletions
diff --git a/Code/download_and_map.py b/Code/download_and_map.py
index b32b4e5..f23e241 100644
--- a/Code/download_and_map.py
+++ b/Code/download_and_map.py
@@ -13,6 +13,7 @@
# 23 DEC 2016, hui, slcu. Updated: 9 Feb 2017
# Last modified 10 APR 2017, hui, slcu
# Last reviewed 31 July 2018
+# Last revised 10 Feb 2021
import os, sys, glob, json
import fnmatch
@@ -75,7 +76,7 @@ def get_list(fname):
return result # only return unique elements
-def make_download_list(all_run_ids, mapped_dir, rna_data_info_dict):
+def make_download_list(mapped_dir, rna_data_info_dict):
'''
Make next n sample IDs. These samples must have not been downloaded yet.
@@ -88,9 +89,9 @@ def make_download_list(all_run_ids, mapped_dir, rna_data_info_dict):
mapped_files = glob_files(mapped_dir, '*_quant.txt')
mapped_run_ids = get_list(DOWNLOADED_SRA_ID_LOG_FILE)
small_ids = get_list(IGNORED_SRA_ID_LOG_FILE) # these files are too small
- for x in sorted(all_run_ids, reverse=True): # SRR first, then ERR, then DRR
- include_me = True if x in rna_data_info_dict and rna_data_info_dict[x] > 0 else False
- if not (x + '_quant.txt') in mapped_files and not x in result and (not x in small_ids) and (not x in mapped_run_ids) and include_me: # not mapped yet and is RNA-seq
+ for run_id in sorted(rna_data_info_dict.keys(), reverse=True): # SRR first, then ERR, then DRR
+ include_me = True if d[run_id]['library_strategy'].lower() == 'rna-seq' and d[run_id]['library_source'].lower() == 'transcriptomic' else False
+ if not (run_id + '_quant.txt') in mapped_files and (not run_id in result) and (not run_id in small_ids) and (not run_id in mapped_run_ids) and include_me: # not mapped yet and is RNA-seq
result.append(x)
return result
@@ -373,7 +374,7 @@ if len(sys.argv) > 1: # user has provided a list of IDs in a file
DAILY_MAP_NUMBER = len(download_list)
else:
print('[download_and_map.py] Prepare download list ...')
- download_list = make_download_list(rna_data_info_dict.keys(), MAPPED_RDATA_DIR, rna_data_info_dict)
+ download_list = make_download_list(MAPPED_RDATA_DIR, rna_data_info_dict)
print('[download_and_map.py] There are %d run IDs from which you could select %d of them.' % (len(download_list), DAILY_MAP_NUMBER))
diff --git a/Code/test.py b/Code/test.py
new file mode 100644
index 0000000..0ddae43
--- /dev/null
+++ b/Code/test.py
@@ -0,0 +1,16 @@
+import json
+
+def read_ena_data_info_json(fname):
+ d = {}
+ with open(fname) as json_data:
+ json_dict = json.load(json_data)
+ for run_id in json_dict:
+ if d[run_id]['library_strategy'].lower() == 'rna-seq' and d[run_id]['library_source'].lower() == 'transcriptomic':
+ d[run_id] = 1
+ return d
+
+
+d = read_ena_data_info_json('../Data/information/rnaseq_info_database.json.temp')
+for k in d:
+ print('%s\t%s' % (d[k]['library_strategy'], d[k]['library_source']))
+