summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- .gitignore 2
-rw-r--r-- Code/configure.py 2
-rw-r--r-- Code/download_and_map.py 27
3 files changed, 22 insertions, 9 deletions
diff --git a/.gitignore b/.gitignore
index 52dc540..b9ca725 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,3 +17,5 @@ Data/*
Code/*.old*
Code/*.pyc*
Code/__pycache__/
+
+brain-Code-20191226.tgz
diff --git a/Code/configure.py b/Code/configure.py
index c740e98..ed6574d 100644
--- a/Code/configure.py
+++ b/Code/configure.py
@@ -6,7 +6,7 @@ SALMON_MAP_RESULT_DIR = '../Data/temp/salmon_map_result'
KMER = 31
# From download_and_map.py
-DAILY_MAP_NUMBER = 5 # download this many samples each time. I have tested the values of 3, 4, 5, 8.
+DAILY_MAP_NUMBER = 4 # download this many samples each time. I have tested the values of 3, 4, 5, 8.
MIN_FASTQ_FILE_SIZE = 200000000 # in bytes, approximately 200MB
RNA_SEQ_INFO_FILE = '../Data/information/rnaseq_info_database.json' # some data downloaded from ENA are not RNA-seq (they are ChIP-seq). Use this file to tell whether the file is RNA-seq
DOWNLOADED_SRA_ID_LOG_FILE = '../Data/log/download_log.txt' # a list of downloaded SRA IDs
diff --git a/Code/download_and_map.py b/Code/download_and_map.py
index 95a4753..3a22315 100644
--- a/Code/download_and_map.py
+++ b/Code/download_and_map.py
@@ -21,7 +21,7 @@ import re
from datetime import datetime
##########################################################################################
-from configure import DAILY_MAP_NUMBER, MIN_FASTQ_FILE_SIZE, RNA_SEQ_INFO_FILE, DOWNLOADED_SRA_ID_LOG_FILE, IGNORED_SRA_ID_LOG_FILE, MAPPED_RDATA_DIR, RAW_RDATA_DIR, SALMON_MAP_RESULT_DIR
+from configure import DAILY_MAP_NUMBER, MIN_FASTQ_FILE_SIZE, RNA_SEQ_INFO_FILE, DOWNLOADED_SRA_ID_LOG_FILE, IGNORED_SRA_ID_LOG_FILE, UPDATE_NETWORK_LOG_FILE, MAPPED_RDATA_DIR, RAW_RDATA_DIR, SALMON_MAP_RESULT_DIR
FASTQ_DUMP_PATH = '/home/hui/software/sratoolkit/sratoolkit.2.8.0-ubuntu64/bin/fastq-dump'
@@ -185,7 +185,7 @@ def download_and_map_data(lst, daily_map_num, dest):
url_lst = get_file_url('../Data/temp/wget_temp_file0.txt')
if url_lst == []:
- write_log_file(IGNORED_SRA_ID_LOG_FILE, run_id+'\n')
+ write_download_log_file(IGNORED_SRA_ID_LOG_FILE, run_id+'\n')
time.sleep(1)
@@ -202,7 +202,7 @@ def download_and_map_data(lst, daily_map_num, dest):
print('[download_and_map.py] IGNORE [%d MB] %s' % (int(sz/1000000.0), link))
file_name = os.path.basename(link)
sample_id = get_sample_id(file_name)
- write_log_file(IGNORED_SRA_ID_LOG_FILE, sample_id+'\n')
+ write_download_log_file(IGNORED_SRA_ID_LOG_FILE, sample_id+'\n')
print(curr_lst)
@@ -253,7 +253,7 @@ def download_data2(lst, dest):
for fname in glob.glob( os.path.join(dest, '%s*gz' % (run_id)) ) :
downloaded_files.append(fname)
else:
- write_log_file(IGNORED_SRA_ID_LOG_FILE, run_id+'\n')
+ write_download_log_file(IGNORED_SRA_ID_LOG_FILE, run_id+'\n')
return downloaded_files
@@ -273,7 +273,7 @@ def salmon_map(lst):
os.system(cmd)
-def write_log_file(fname, s):
+def write_download_log_file(fname, s):
if not os.path.exists(fname):
f = open(fname, 'w')
else:
@@ -282,6 +282,16 @@ def write_log_file(fname, s):
f.close()
+def write_network_log_file(s, fname):
+ f = open(fname, 'a')
+ curr_time = datetime.now().strftime('%Y-%m-%d %H:%M')
+ s = '[' + curr_time + ']: ' + s
+ if not '\n' in s:
+ s += '\n'
+ f.write(s)
+ f.close()
+
+
def last_session_finished(fname):
''' return true if log file ends with DONE. '''
if not os.path.exists(fname):
@@ -345,6 +355,7 @@ if not os.path.exists(RNA_SEQ_INFO_FILE):
available_G = 4 * os.statvfs('/home').f_bavail / (1024*1024) # compute available space (in G). Each block has 4k bytes, work for Linux/UNIX systems only
if available_G < 3 * DAILY_MAP_NUMBER:
print('[download_and_map.py] home directory does not have enough space (only %d G available) ' % (available_G))
+ write_network_log_file('[download_and_map.py] home directory does not have enough space (only %d G available).' % (available_G), UPDATE_NETWORK_LOG_FILE)
sys.exit()
if not last_session_finished(DOWNLOADED_SRA_ID_LOG_FILE): # last session not finished
@@ -365,7 +376,7 @@ else:
# Make a record in log.txt
curr_time = datetime.now().strftime('%Y-%m-%d_%H%M') # append date info to newly created directories
-write_log_file(DOWNLOADED_SRA_ID_LOG_FILE, 'START at %s\n' % (curr_time))
+write_download_log_file(DOWNLOADED_SRA_ID_LOG_FILE, 'START at %s\n' % (curr_time))
# Download these RNA-seq IDs and map them using salmon
print('[download_and_map.py] Start downloading and mapping ...')
@@ -386,5 +397,5 @@ else:
print('[download_and_map.py] No quant files to move.')
-write_log_file(DOWNLOADED_SRA_ID_LOG_FILE, '%s\n' % ('\n'.join(map_list)))
-write_log_file(DOWNLOADED_SRA_ID_LOG_FILE, 'DONE at %s\n' % (curr_time))
+write_download_log_file(DOWNLOADED_SRA_ID_LOG_FILE, '%s\n' % ('\n'.join(map_list)))
+write_download_log_file(DOWNLOADED_SRA_ID_LOG_FILE, 'DONE at %s\n' % (curr_time))