summary | refs | log | tree | commit | diff
path: root/Code/download_and_map.py
diff options
context:
space:
mode:
author Lan Hui <lanhui@zjnu.edu.cn> 2025-10-29 18:00:27 +0800
committer Lan Hui <lanhui@zjnu.edu.cn> 2025-10-29 18:00:27 +0800
commit 9a85ad752194846f26f2555d1f059f76ed31c43d (patch)
tree c1b4bb5408e782c48f9528230f79f0365be02426 /Code/download_and_map.py
parent fafb342fe7e756c7a23b9d371565f089afdf18bd (diff)
Also look for a run's library strategy/source in EXPERIMENT_LIBRARY_INFO_FILE
Diffstat (limited to 'Code/download_and_map.py')
-rw-r--r-- Code/download_and_map.py 22
1 files changed, 19 insertions, 3 deletions
diff --git a/Code/download_and_map.py b/Code/download_and_map.py
index e60b150..e9fb14b 100644
--- a/Code/download_and_map.py
+++ b/Code/download_and_map.py
@@ -23,7 +23,7 @@ import json
from datetime import datetime
##########################################################################################
-from configure import DAILY_MAP_NUMBER, MIN_FASTQ_FILE_SIZE, RNA_SEQ_INFO_FILE, DOWNLOADED_SRA_ID_LOG_FILE, IGNORED_SRA_ID_LOG_FILE, UPDATE_NETWORK_LOG_FILE, MAPPED_RDATA_DIR, RAW_RDATA_DIR, SALMON_MAP_RESULT_DIR
+from configure import DAILY_MAP_NUMBER, MIN_FASTQ_FILE_SIZE, RNA_SEQ_INFO_FILE, EXPERIMENT_LIBRARY_INFO_FILE, DOWNLOADED_SRA_ID_LOG_FILE, IGNORED_SRA_ID_LOG_FILE, UPDATE_NETWORK_LOG_FILE, MAPPED_RDATA_DIR, RAW_RDATA_DIR, SALMON_MAP_RESULT_DIR
##########################################################################################
def glob_files(directory, pattern):
@@ -74,7 +74,7 @@ def get_list(fname):
return result # only return unique elements
-def make_download_list(mapped_dir, rna_data_info_dict):
+def make_download_list(mapped_dir, rna_data_info_dict, experiment_library_info_dict={}):
'''
Make next n sample IDs. These samples must have not been downloaded yet.
@@ -89,6 +89,10 @@ def make_download_list(mapped_dir, rna_data_info_dict):
small_ids = get_list(IGNORED_SRA_ID_LOG_FILE) # these data files are too small
for run_id in sorted(rna_data_info_dict.keys(), reverse=True): # SRR first, then ERR, then DRR
include_me_because_i_am_rnaseq = True if rna_data_info_dict[run_id]['library_strategy'].lower() == 'rna-seq' and rna_data_info_dict[run_id]['library_source'].lower() == 'transcriptomic' else False
+ associated_experiment_id = rna_data_info_dict[run_id]['experiment_id']
+ if associated_experiment_id in experiment_library_info_dict:
+ is_rnaseq = 'rna-seq' in experiment_library_info_dict[associated_experiment_id]['library_strategy'].lower() and 'transcriptomic' in experiment_library_info_dict[associated_experiment_id]['library_source']
+ include_me_because_i_am_rnaseq = include_me_because_i_am_rnaseq or is_rnaseq
if not (run_id + '_quant.txt') in mapped_files and (not run_id in result) and (not run_id in small_ids) and (not run_id in mapped_run_ids) and include_me_because_i_am_rnaseq: # not mapped yet and is RNA-seq
result.append(run_id)
return result
@@ -352,6 +356,17 @@ def read_ena_data_info_json(fname):
return json_dict
def read_experiment_library_information(fname):
    '''
    Read per-experiment library information from a tab-separated text file.

    Each useful line holds at least three tab-separated fields, in this
    order: experiment ID, library strategy, library source.  Any extra
    fields are ignored.  If an experiment ID occurs more than once, the
    last occurrence wins.

    Blank lines and lines with fewer than three fields are skipped, so one
    malformed record does not raise IndexError and abort the caller.

    fname -- path to the tab-separated file.

    Return a dictionary of the form
    {experiment_id: {'library_strategy': ..., 'library_source': ...}}.
    '''
    result = {}
    with open(fname) as f:
        for line in f:
            lst = line.strip().split('\t')
            if len(lst) < 3:
                # Blank or truncated record: nothing reliable to store.
                continue
            result[lst[0]] = {'library_strategy': lst[1], 'library_source': lst[2]}
    return result
+
+
def read_run_ids_from_file(fname):
f = open(fname)
lst = []
@@ -411,6 +426,7 @@ if not last_session_finished(DOWNLOADED_SRA_ID_LOG_FILE): # last session not fin
sys.exit()
rna_data_info_dict = read_ena_data_info_json(RNA_SEQ_INFO_FILE) # rna_data_info_dict contains only RNA-seq IDs.
+experiment_library_info_dict = read_experiment_library_information(EXPERIMENT_LIBRARY_INFO_FILE)
# Generate DRR/ERR/SRR ids to download
if len(sys.argv) > 1: # user has provided a list of IDs in a file
@@ -418,7 +434,7 @@ if len(sys.argv) > 1: # user has provided a list of IDs in a file
DAILY_MAP_NUMBER = len(download_list)
else:
print('[download_and_map.py] Prepare download list ...')
- download_list = make_download_list(MAPPED_RDATA_DIR, rna_data_info_dict)
+ download_list = make_download_list(MAPPED_RDATA_DIR, rna_data_info_dict, experiment_library_info_dict)
print('[download_and_map.py] There are %d run IDs from which you could select %d of them.' % (len(download_list), DAILY_MAP_NUMBER))
# Make a record in log.txt