diff options
Diffstat (limited to 'Code')
-rw-r--r-- | Code/download_and_map.py | 11 | ||||
-rw-r--r-- | Code/test.py | 16 |
2 files changed, 22 insertions, 5 deletions
diff --git a/Code/download_and_map.py b/Code/download_and_map.py index b32b4e5..f23e241 100644 --- a/Code/download_and_map.py +++ b/Code/download_and_map.py @@ -13,6 +13,7 @@ # 23 DEC 2016, hui, slcu. Updated: 9 Feb 2017
# Last modified 10 APR 2017, hui, slcu
# Last reviewed 31 July 2018
+# Last revised 10 Feb 2021
import os, sys, glob, json
import fnmatch
@@ -75,7 +76,7 @@ def get_list(fname): return result # only return unique elements
-def make_download_list(all_run_ids, mapped_dir, rna_data_info_dict):
+def make_download_list(mapped_dir, rna_data_info_dict):
'''
Make next n sample IDs. These samples must have not been downloaded yet.
@@ -88,9 +89,9 @@ def make_download_list(all_run_ids, mapped_dir, rna_data_info_dict): mapped_files = glob_files(mapped_dir, '*_quant.txt')
mapped_run_ids = get_list(DOWNLOADED_SRA_ID_LOG_FILE)
small_ids = get_list(IGNORED_SRA_ID_LOG_FILE) # these files are too small
- for x in sorted(all_run_ids, reverse=True): # SRR first, then ERR, then DRR
- include_me = True if x in rna_data_info_dict and rna_data_info_dict[x] > 0 else False
- if not (x + '_quant.txt') in mapped_files and not x in result and (not x in small_ids) and (not x in mapped_run_ids) and include_me: # not mapped yet and is RNA-seq
+ for run_id in sorted(rna_data_info_dict.keys(), reverse=True): # SRR first, then ERR, then DRR
+ include_me = True if d[run_id]['library_strategy'].lower() == 'rna-seq' and d[run_id]['library_source'].lower() == 'transcriptomic' else False
+ if not (run_id + '_quant.txt') in mapped_files and (not run_id in result) and (not run_id in small_ids) and (not run_id in mapped_run_ids) and include_me: # not mapped yet and is RNA-seq
result.append(x)
return result
@@ -373,7 +374,7 @@ if len(sys.argv) > 1: # user has provided a list of IDs in a file DAILY_MAP_NUMBER = len(download_list)
else:
print('[download_and_map.py] Prepare download list ...')
- download_list = make_download_list(rna_data_info_dict.keys(), MAPPED_RDATA_DIR, rna_data_info_dict)
+ download_list = make_download_list(MAPPED_RDATA_DIR, rna_data_info_dict)
print('[download_and_map.py] There are %d run IDs from which you could select %d of them.' % (len(download_list), DAILY_MAP_NUMBER))
# diff --git a/Code/test.py b/Code/test.py (new file mode 100644, index 0000000..0ddae43)
# Scratch check of the RNA-seq filter applied to the run-information JSON file.
import json


def read_ena_data_info_json(fname):
    '''
    Load the run-information JSON file and keep only the RNA-seq runs.

    fname -- path to a JSON file holding one object keyed by run ID.  Each
             value is expected to be a record (dict) with the string fields
             'library_strategy' and 'library_source'.

    Return a dict mapping run ID to its record, restricted to runs whose
    library_strategy is 'rna-seq' and whose library_source is
    'transcriptomic' (both compared case-insensitively).  Runs where either
    field is missing or null are left out.

    Errors from open() and json.load() are not caught here.
    '''
    with open(fname) as json_data:
        json_dict = json.load(json_data)

    d = {}
    for run_id in json_dict:
        # Test the loaded record; the result dict d is still empty at this
        # point, so looking the run up in d raises KeyError.
        record = json_dict[run_id]
        strategy = (record.get('library_strategy') or '').lower()
        source = (record.get('library_source') or '').lower()
        if strategy == 'rna-seq' and source == 'transcriptomic':
            # Keep the whole record so that callers can read its fields.
            d[run_id] = record
    return d


if __name__ == '__main__':
    # Print strategy and source of every selected run as a quick sanity check.
    d = read_ena_data_info_json('../Data/information/rnaseq_info_database.json.temp')
    for k in d:
        print('%s\t%s' % (d[k]['library_strategy'], d[k]['library_source']))