From 823db708ed6b37a44a970d5f5a3e9c765aa83c16 Mon Sep 17 00:00:00 2001 From: Hui Lan Date: Wed, 10 Feb 2021 14:26:09 +0800 Subject: download_and_map.py: adapt this script to use the latest rnaseq_info_database.json file. --- Code/download_and_map.py | 11 ++++++----- Code/test.py | 16 ++++++++++++++++ 2 files changed, 22 insertions(+), 5 deletions(-) create mode 100644 Code/test.py diff --git a/Code/download_and_map.py b/Code/download_and_map.py index b32b4e5..f23e241 100644 --- a/Code/download_and_map.py +++ b/Code/download_and_map.py @@ -13,6 +13,7 @@ # 23 DEC 2016, hui, slcu. Updated: 9 Feb 2017 # Last modified 10 APR 2017, hui, slcu # Last reviewed 31 July 2018 +# Last revised 10 Feb 2021 import os, sys, glob, json import fnmatch @@ -75,7 +76,7 @@ def get_list(fname): return result # only return unique elements -def make_download_list(all_run_ids, mapped_dir, rna_data_info_dict): +def make_download_list(mapped_dir, rna_data_info_dict): ''' Make next n sample IDs. These samples must have not been downloaded yet. 
@@ -88,9 +89,9 @@ def make_download_list(all_run_ids, mapped_dir, rna_data_info_dict): mapped_files = glob_files(mapped_dir, '*_quant.txt') mapped_run_ids = get_list(DOWNLOADED_SRA_ID_LOG_FILE) small_ids = get_list(IGNORED_SRA_ID_LOG_FILE) # these files are too small - for x in sorted(all_run_ids, reverse=True): # SRR first, then ERR, then DRR - include_me = True if x in rna_data_info_dict and rna_data_info_dict[x] > 0 else False - if not (x + '_quant.txt') in mapped_files and not x in result and (not x in small_ids) and (not x in mapped_run_ids) and include_me: # not mapped yet and is RNA-seq + for run_id in sorted(rna_data_info_dict.keys(), reverse=True): # SRR first, then ERR, then DRR + include_me = True if d[run_id]['library_strategy'].lower() == 'rna-seq' and d[run_id]['library_source'].lower() == 'transcriptomic' else False + if not (run_id + '_quant.txt') in mapped_files and (not run_id in result) and (not run_id in small_ids) and (not run_id in mapped_run_ids) and include_me: # not mapped yet and is RNA-seq result.append(x) return result @@ -373,7 +374,7 @@ if len(sys.argv) > 1: # user has provided a list of IDs in a file DAILY_MAP_NUMBER = len(download_list) else: print('[download_and_map.py] Prepare download list ...') - download_list = make_download_list(rna_data_info_dict.keys(), MAPPED_RDATA_DIR, rna_data_info_dict) + download_list = make_download_list(MAPPED_RDATA_DIR, rna_data_info_dict) print('[download_and_map.py] There are %d run IDs from which you could select %d of them.' 
% (len(download_list), DAILY_MAP_NUMBER)) diff --git a/Code/test.py b/Code/test.py new file mode 100644 index 0000000..0ddae43 --- /dev/null +++ b/Code/test.py @@ -0,0 +1,16 @@ +import json + +def read_ena_data_info_json(fname): + d = {} + with open(fname) as json_data: + json_dict = json.load(json_data) + for run_id in json_dict: + if d[run_id]['library_strategy'].lower() == 'rna-seq' and d[run_id]['library_source'].lower() == 'transcriptomic': + d[run_id] = 1 + return d + + +d = read_ena_data_info_json('../Data/information/rnaseq_info_database.json.temp') +for k in d: + print('%s\t%s' % (d[k]['library_strategy'], d[k]['library_source'])) + -- cgit v1.2.1