From 57366ce96686f171d429af35ca99a8458a3bf666 Mon Sep 17 00:00:00 2001 From: Hui Lan Date: Tue, 9 Feb 2021 10:56:56 +0800 Subject: update_network.py: code review. --- Code/configure.py | 4 +-- Code/update_network.py | 83 ++++++++------------------------------------------ 2 files changed, 15 insertions(+), 72 deletions(-) diff --git a/Code/configure.py b/Code/configure.py index 9f159c1..5e329af 100644 --- a/Code/configure.py +++ b/Code/configure.py @@ -34,8 +34,8 @@ PARAMETER_FOR_NET_WIGGELAB_DIURNAL = '../Data/parameter/parameter_for_net_wig BINDING_FILE = '../Data/history/bind/binding.txt' TPM_FILE = '../Data/history/expr/TPM.txt' # gene expression data -PARAMETER_FOR_BUILDRMATRIX_RENEW_INTERVAL = 1 # check every 28 days for updating TPM.txt -MIN_RNA_SEQ_INCREASE = 2 # minimum RNA-seq experiments needed when updating parameter_for_buildRmatrix.txt +PARAMETER_FOR_BUILDRMATRIX_RENEW_INTERVAL = 14 # check whether TPM.txt needs updating on days of the month divisible by this value (with 14: the 14th and the 28th) +MIN_RNA_SEQ_INCREASE = 60 # minimum RNA-seq experiments needed when updating parameter_for_buildRmatrix.txt UPDATE_NETWORK_LOG_FILE = '../Data/log/update.network.log.txt' # network update log. We should check this file from time to time. NEW_OR_UPDATED_CHIP_FILE = '../Data/log/new.or.updated.chip.file.txt' diff --git a/Code/update_network.py b/Code/update_network.py index f0e4ec2..13e1c11 100755 --- a/Code/update_network.py +++ b/Code/update_network.py @@ -8,7 +8,7 @@ # 1. crontab -e. # 2. Add this line: 01 05 * * * cd /home/hui/network/v03/Code && python3 update_network.py # -# IMPORTANT: Make sure execute this script (update_network.py) under the directory Code. +# IMPORTANT: Make sure that you execute this script (update_network.py) under the directory Code. # # Purpose: periodically (e.g., per week) run this script to see if the network needs update. If yes, update it. # @@ -16,7 +16,7 @@ # parameter_for_buildRmatrix.txt and parameter_for_net.txt to make # changes in these file effective. 
# -# parameter_for_buildCmatrix.txt will be updated automatically (I +# parameter_for_buildRmatrix.txt will be updated automatically (I # hope). However, we need to update parameter_for_buildCmatrix.txt # manually. # @@ -557,45 +557,23 @@ def correlation_mixtools(num_component): def check_rnaseq_info(): - # check rnaseq_info_database.txt and rnaseq_info_database.json, if they are outdated, then remind us to update it in log file. + # Check rnaseq_info_database.txt and rnaseq_info_database.json. If either is outdated, remind us in the log file to update it. if os.path.exists(RNA_SEQ_INFO_DATABASE): - if age_of_file_in_days(RNA_SEQ_INFO_DATABASE) > 90: # older than 120 days + if age_of_file_in_days(RNA_SEQ_INFO_DATABASE) > 120: # older than 120 days write_log_file('[update_network.py] Need update %s. It is %d days old.' % (RNA_SEQ_INFO_DATABASE, age_of_file_in_days(RNA_SEQ_INFO_DATABASE)), UPDATE_NETWORK_LOG_FILE) else: write_log_file('[update_network.py] [MISSING] Must create %s.' % (RNA_SEQ_INFO_DATABASE), UPDATE_NETWORK_LOG_FILE) if os.path.exists(RNA_SEQ_INFO_DATABASE_JSON): - if age_of_file_in_days(RNA_SEQ_INFO_DATABASE_JSON) > 90: + if age_of_file_in_days(RNA_SEQ_INFO_DATABASE_JSON) > 120: write_log_file('[update_network.py] Need update %s. It is %d days old.' % (RNA_SEQ_INFO_DATABASE_JSON, age_of_file_in_days(RNA_SEQ_INFO_DATABASE_JSON)), UPDATE_NETWORK_LOG_FILE) else: write_log_file('[update_network.py] [MISSING] Must create %s.' 
% (RNA_SEQ_INFO_DATABASE_JSON), UPDATE_NETWORK_LOG_FILE) -# def check_process(name): -# ''' If a process name exists, return 1; otherwise return 0.''' -# os.system('ps -eF | grep \'%s\' > ../Data/running_processes.txt' % (name)) -# f = open('../Data/running_processes.txt') -# lines = f.readlines() -# f.close() -# for line in lines: -# line = line.strip() -# lst = line.split() -# if 'python' in lst[-2] and name in lst[-1]: -# return 1 -# return 0 - - - -## main - -# if check_process('update_network.py') == 1: # the old update_network.py is running -# write_log_file('[update_network.py] update_network.py has not finished yet.', UPDATE_NETWORK_LOG_FILE) -# sys.exit() - - - +# main FILE_LIST_TO_CHECK = [PARAMETER_FOR_BUILDCMATRIX, PARAMETER_FOR_BUILDRMATRIX, PARAMETER_FOR_NET, \ - MERGED_EDGE_FILE, BINDING_FILE, TPM_FILE] # a list of very important files + MERGED_EDGE_FILE, BINDING_FILE, TPM_FILE] # a list of important files make_important_dirs() # make important directories (if non-existent) for holding various kinds of files, must be put after os.chdir(CODE_DIR) #validate_webapp_dir(PARAMETER_FOR_NET) # make sure the directory Webapp contains necessary files, e.g., genes.json. @@ -650,7 +628,7 @@ if miss_lst != []: # miss_lst is non-empty in the beginning. # Make json2 (sliced binding.txt) if it does not exist. Copy json2 to -# the web application folder static/edges [manual] for displaying +# the web application folder static/edges [do it manually] for displaying # binding strength plots. if not os.path.isdir('../Data/history/bind/json2') and os.path.exists(BINDING_FILE): write_log_file('Make directory ../Data/history/bind/json2. 
Don\'t forget to copy json2 to static/edges in the web application.', UPDATE_NETWORK_LOG_FILE) @@ -677,7 +655,7 @@ validate_parameter_for_net(PARAMETER_FOR_NET) if not os.path.exists(FILE_TIMESTAMP): record_file_time(FILE_LIST_TO_CHECK, FILE_TIMESTAMP) -# get update time of mandatory files +# Get update time of mandatory files timestamp_dict = read_file_timestamp(FILE_TIMESTAMP) @@ -692,31 +670,6 @@ if 'parameter_for_buildCmatrix.txt' in updated_file_list and not hold_on(PARAMET os.system(cmd) - # # We will only consider ChIP-seq IDs that are less than 7 days - # # old. Make sure put 'update:yymmdd' in the 'NOTE:' field in - # # parameter_for_buildCmatrix.txt for each newly added ChIP-seq - # # data. - # write_log_file('[update_network.py] Build binding matrix from recently added/modified ChIP-seq data.', UPDATE_NETWORK_LOG_FILE) - # TEMP_BINDING_FILE = BINDING_FILE + '.temp' - # cmd = 'python3 buildCmatrix.py %s > %s' % (PARAMETER_FOR_BUILDCMATRIX, TEMP_BINDING_FILE) - # os.system(cmd) - - # # If someone just touched prameter_for_buildCmatrix.txt without - # # adding any new ChIP-seq data, we should do nothing. - # if validate_binding_file(TEMP_BINDING_FILE): - # write_log_file('[update_network.py] Overwrite binding.txt.', UPDATE_NETWORK_LOG_FILE) - # cm = 'mv %s %s' (TEMP_BINDING_FILE, BINDING_FILE) # Overwrite binding.txt. Make it formal. - # os.system(cmd) - # write_log_file('[update_network.py] binding.txt is updated. Number of columns in %s = %d.' % (BINDING_FILE, num_ids(BINDING_FILE)), UPDATE_NETWORK_LOG_FILE) - - # write_log_file('[update_network.py] Update target tf file %s.' 
% (TARGET_TF_FILE), UPDATE_NETWORK_LOG_FILE) - # cmd = 'python3 make_target_tf.py %s > %s' % (PARAMETER_FOR_NET, TARGET_TF_FILE) - # os.system(cmd) - # else: - # write_log_file('[update_network.py] [WARNING] Invalid binding matrix.', UPDATE_NETWORK_LOG_FILE) - # os.remove(TEMP_BINDING_FILE) - - updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict) if 'binding.txt' in updated_file_list: write_log_file('[update_network.py] binding.txt has been updated. This update will take effect next time TPM.txt is updated.', UPDATE_NETWORK_LOG_FILE) @@ -748,16 +701,6 @@ if datetime.now().day % PARAMETER_FOR_BUILDRMATRIX_RENEW_INTERVAL == 0: # check write_log_file('[update_network.py] Update %s' % (PARAMETER_FOR_BUILDRMATRIX), UPDATE_NETWORK_LOG_FILE) cmd = 'cp %s %s' % (new_parameter_file, PARAMETER_FOR_BUILDRMATRIX) os.system(cmd) - - # Before we rewrite TPM.txt, we should backup the old TPM.txt - # write_log_file('[update_network.py] Backup %s' % (TPM_FILE), UPDATE_NETWORK_LOG_FILE) - # cmd = 'cp %s %s' % (TPM_FILE, TPM_FILE + '.backup.at.' + curr_time) - # os.system(cmd) - - # write_log_file('[update_network.py] Rebuild %s' % (TPM_FILE), UPDATE_NETWORK_LOG_FILE) - # cmd = 'python3 buildRmatrix.py ../Data/parameter/parameter_for_buildRmatrix.txt' - # os.system(cmd) - else: write_log_file('[update_network.py] You have downloaded %d RNA-seq since last build of TPM.txt. TPM.txt will be rebuilt if this number reaches %d.' 
% (num, MIN_RNA_SEQ_INCREASE), UPDATE_NETWORK_LOG_FILE) @@ -778,7 +721,7 @@ if 'parameter_for_buildRmatrix.txt' in updated_file_list and not hold_on(PARAMET cmd = 'gzip %s' % (backup_file_name) os.system(cmd) - cmd = 'python3 buildRmatrix.py %s' % (PARAMETER_FOR_BUILDRMATRIX) # produce TPM.txt, the location of which is specified in TPM_TABLE in buidlRmatrix.py + cmd = 'python3 buildRmatrix.py %s' % (PARAMETER_FOR_BUILDRMATRIX) # produce TPM.txt, whose location is specified in TPM_TABLE in buildRmatrix.py os.system(cmd) curr_date = datetime.now().strftime('%Y%m%d') @@ -789,7 +732,7 @@ if 'parameter_for_buildRmatrix.txt' in updated_file_list and not hold_on(PARAMET # Create edges using all RNA-seq experiments updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict) -if 'TPM.txt' in updated_file_list: # we could touch TPM.txt to make it recent. We will recompute edges using the full binding.txt. +if 'TPM.txt' in updated_file_list: # we could _touch_ TPM.txt to make it recent. We will recompute edges using the full binding.txt. # Make a full binding.txt since we are going to use the new TPM.txt to recompute all edges write_log_file('[update_network.py] Build full binding matrix for the new TPM.txt.', UPDATE_NETWORK_LOG_FILE) cmd = 'python3 buildCmatrix.py %s include-all > %s' % (PARAMETER_FOR_BUILDCMATRIX, BINDING_FILE) # include all ChIP-seq IDs. Pay attention to include-all in the command-line argument. @@ -806,7 +749,7 @@ if 'TPM.txt' in updated_file_list: # we could touch TPM.txt to make it recent. ## os.system(cmd) # turn this on if we are going to use this TPM.txt for displaying scatterplots write_log_file('[update_network.py] Update directory ../Data/history/bind/json2. 
Don\'t forget to copy json2 to static/edges in the web application.', UPDATE_NETWORK_LOG_FILE) cmd = 'python3 slice_binding_to_JSON.py %s' % (PARAMETER_FOR_NET) - #os.system(cmd) # turn this on if we are going to use this bindingtxt for displaying bar charts of binding strengths + #os.system(cmd) # turn this on if we are going to use this binding.txt for displaying bar charts of binding strengths ## copy ../Data/history/bind/json2 and ../Data/history/expr/json to the web application folder 'static/edges' [manual] if False: # TODO For now I will always use travadb's TPM.txt (138 columns) to display scatterplots. Simpler and faster. @@ -828,7 +771,7 @@ if 'TPM.txt' in updated_file_list: # we could touch TPM.txt to make it recent. wedge() correlation_per_group() correlation_per_group_fixed_number() - correlation_mixtools(2) + correlation_mixtools(2) # two components #correlation_mixtools(3) -- cgit v1.2.1