From 5a70b3b498e64dc903b017d45be09a808cfb2b89 Mon Sep 17 00:00:00 2001 From: Hui Lan Date: Sat, 18 Jan 2020 18:45:28 +0800 Subject: download_and_map.py: write "not enough space" warning message to network log file If there is not enough space left on the disk, download_and_map.py will refuse to download any data. This can be quite mysterious for a maintainer. So, write the reason to the network log file. The reason is something like: "[download_and_map.py] home directory does not have enough space (only 13 G available)." -Hui --- Code/configure.py | 2 +- Code/download_and_map.py | 27 +++++++++++++++++++-------- 2 files changed, 20 insertions(+), 9 deletions(-) (limited to 'Code') diff --git a/Code/configure.py b/Code/configure.py index c740e98..ed6574d 100644 --- a/Code/configure.py +++ b/Code/configure.py @@ -6,7 +6,7 @@ SALMON_MAP_RESULT_DIR = '../Data/temp/salmon_map_result' KMER = 31 # From download_and_map.py -DAILY_MAP_NUMBER = 5 # download this many samples each time. I have tested the values of 3, 4, 5, 8. +DAILY_MAP_NUMBER = 4 # download this many samples each time. I have tested the values of 3, 4, 5, 8. MIN_FASTQ_FILE_SIZE = 200000000 # in bytes, approximately 200MB RNA_SEQ_INFO_FILE = '../Data/information/rnaseq_info_database.json' # some data downloaded from ENA are not RNA-seq (they are ChIP-seq). 
Use this file to tell whether the file is RNA-seq DOWNLOADED_SRA_ID_LOG_FILE = '../Data/log/download_log.txt' # a list of downloaded SRA IDs diff --git a/Code/download_and_map.py b/Code/download_and_map.py index 95a4753..3a22315 100644 --- a/Code/download_and_map.py +++ b/Code/download_and_map.py @@ -21,7 +21,7 @@ import re from datetime import datetime ########################################################################################## -from configure import DAILY_MAP_NUMBER, MIN_FASTQ_FILE_SIZE, RNA_SEQ_INFO_FILE, DOWNLOADED_SRA_ID_LOG_FILE, IGNORED_SRA_ID_LOG_FILE, MAPPED_RDATA_DIR, RAW_RDATA_DIR, SALMON_MAP_RESULT_DIR +from configure import DAILY_MAP_NUMBER, MIN_FASTQ_FILE_SIZE, RNA_SEQ_INFO_FILE, DOWNLOADED_SRA_ID_LOG_FILE, IGNORED_SRA_ID_LOG_FILE, UPDATE_NETWORK_LOG_FILE, MAPPED_RDATA_DIR, RAW_RDATA_DIR, SALMON_MAP_RESULT_DIR FASTQ_DUMP_PATH = '/home/hui/software/sratoolkit/sratoolkit.2.8.0-ubuntu64/bin/fastq-dump' @@ -185,7 +185,7 @@ def download_and_map_data(lst, daily_map_num, dest): url_lst = get_file_url('../Data/temp/wget_temp_file0.txt') if url_lst == []: - write_log_file(IGNORED_SRA_ID_LOG_FILE, run_id+'\n') + write_download_log_file(IGNORED_SRA_ID_LOG_FILE, run_id+'\n') time.sleep(1) @@ -202,7 +202,7 @@ def download_and_map_data(lst, daily_map_num, dest): print('[download_and_map.py] IGNORE [%d MB] %s' % (int(sz/1000000.0), link)) file_name = os.path.basename(link) sample_id = get_sample_id(file_name) - write_log_file(IGNORED_SRA_ID_LOG_FILE, sample_id+'\n') + write_download_log_file(IGNORED_SRA_ID_LOG_FILE, sample_id+'\n') print(curr_lst) @@ -253,7 +253,7 @@ def download_data2(lst, dest): for fname in glob.glob( os.path.join(dest, '%s*gz' % (run_id)) ) : downloaded_files.append(fname) else: - write_log_file(IGNORED_SRA_ID_LOG_FILE, run_id+'\n') + write_download_log_file(IGNORED_SRA_ID_LOG_FILE, run_id+'\n') return downloaded_files @@ -273,7 +273,7 @@ def salmon_map(lst): os.system(cmd) -def write_log_file(fname, s): +def 
write_download_log_file(fname, s): if not os.path.exists(fname): f = open(fname, 'w') else: @@ -282,6 +282,16 @@ def write_log_file(fname, s): f.close() +def write_network_log_file(s, fname): + f = open(fname, 'a') + curr_time = datetime.now().strftime('%Y-%m-%d %H:%M') + s = '[' + curr_time + ']: ' + s + if not '\n' in s: + s += '\n' + f.write(s) + f.close() + + def last_session_finished(fname): ''' return true if log file ends with DONE. ''' if not os.path.exists(fname): @@ -345,6 +355,7 @@ if not os.path.exists(RNA_SEQ_INFO_FILE): available_G = 4 * os.statvfs('/home').f_bavail / (1024*1024) # compute available space (in G). Each block has 4k bytes, work for Linux/UNIX systems only if available_G < 3 * DAILY_MAP_NUMBER: print('[download_and_map.py] home directory does not have enough space (only %d G available) ' % (available_G)) + write_network_log_file('[download_and_map.py] home directory does not have enough space (only %d G available).' % (available_G), UPDATE_NETWORK_LOG_FILE) sys.exit() if not last_session_finished(DOWNLOADED_SRA_ID_LOG_FILE): # last session not finished @@ -365,7 +376,7 @@ else: # Make a record in log.txt curr_time = datetime.now().strftime('%Y-%m-%d_%H%M') # append date info to newly created directories -write_log_file(DOWNLOADED_SRA_ID_LOG_FILE, 'START at %s\n' % (curr_time)) +write_download_log_file(DOWNLOADED_SRA_ID_LOG_FILE, 'START at %s\n' % (curr_time)) # Download these RNA-seq IDs and map them using salmon print('[download_and_map.py] Start downloading and mapping ...') @@ -386,5 +397,5 @@ else: print('[download_and_map.py] No quant files to move.') -write_log_file(DOWNLOADED_SRA_ID_LOG_FILE, '%s\n' % ('\n'.join(map_list))) -write_log_file(DOWNLOADED_SRA_ID_LOG_FILE, 'DONE at %s\n' % (curr_time)) +write_download_log_file(DOWNLOADED_SRA_ID_LOG_FILE, '%s\n' % ('\n'.join(map_list))) +write_download_log_file(DOWNLOADED_SRA_ID_LOG_FILE, 'DONE at %s\n' % (curr_time)) -- cgit v1.2.1