diff options
author | Hui Lan <lanhui@zjnu.edu.cn> | 2024-08-06 16:06:47 +0800 |
---|---|---|
committer | Hui Lan <lanhui@zjnu.edu.cn> | 2024-08-06 16:06:47 +0800 |
commit | 173161a747ab2abb4a0d2a7dddecb39ddc9a4589 (patch) | |
tree | b911ac05f48ea815bd4ae700e7216da126bbe14b | |
parent | 2f1b7bc8d9542e3d65007b6658691ae9e5c5e870 (diff) |
update_network.py: delete the 'merge edge' stuff because this is the job of update_network_by_force.py
-rwxr-xr-x | Code/#update_network.py# | 828 | ||||
l--------- | Code/.#update_network.py | 1 | ||||
-rwxr-xr-x | Code/update_network.py | 74 |
3 files changed, 0 insertions, 903 deletions
diff --git a/Code/#update_network.py# b/Code/#update_network.py# deleted file mode 100755 index 4e46fdc..0000000 --- a/Code/#update_network.py# +++ /dev/null @@ -1,828 +0,0 @@ -#! /usr/bin/python3 -# Usage: python3 update_network.py -# Put this script under directory Code/. -# IMPORTANT: Run this script under directory Code/. -# Execute the above command regularly, or -# Cron job this command to make it run everyday at 5am: -# -# 1. crontab -e. -# 2. Add this line: 01 05 * * * cd /home/hui/network/v03/Code && python3 update_network.py -# -# IMPORTANT: Make sure that you execute this script (update_network.py) under the directory Code. -# -# Purpose: periodically (e.g., per week) run this script to see if the network needs update. If yes, update it. -# -# Set HOLDON=NO in parameter_for_buildCmatrix.txt, -# parameter_for_buildRmatrix.txt and parameter_for_net.txt to make -# changes in these file effective. -# -# parameter_for_buildRmatrix.txt will be updated automatically (I -# hope). However, we need to update parameter_for_buildCmatrix.txt -# manually. -# -# Revision history: -# -# Last modified: 26 Feb 2017 -# Last modified: 17 Mar 2017 -# Last modified: 04 Apr 2017 -# Last modified: 05 Apr 2017 -# Last modified: 10 Apr 2017 -# Last modified: 19 Apr 2017 -# Last modified: 20 Apr 2017 [addded create_edges0B.py which calls correlation_per_tissue.R] -# Last modified: 21 Jun 2017 [added correlation_per_group.R and wedge.R] -# Last modified: 30 Jun 2017 [added get_sample_size so that we have sample size for correlations of type all, added in ll_dict ] -# Last modified: 23 Jan 2018 [edited a few print-out messages] -# Last modified: 25 Jan 2018 [updated function compute_metric(), set S=365.0 and modified return statement] -# Last modified: 24 Aug 2018 [updated function from get_sample_size(d, sorted_keys, day) to get_sample_size(d, sorted_keys, day, rcond_string)] -# Last modified: 03 Feb 2019 -# Last modified: 08 Aug 2019, hui -# Last modified: 10 Aug 2019, hui <lanhui@zjnu.edu.cn> -# Last modified: 23 Aug 2019, hui <lanhui@zjnu.edu.cn> [correlation_mixtools(num_component)] -# Last modified: 10 Sep 2019, hui <lanhui@zjnu.edu.cn> [correlation_mixtools, check the previous R session has finished before starting a new one.] - -import os, sys -import numpy as np -import glob -import time -import subprocess -from datetime import datetime -from param4net import make_global_param_dict, get_key_value -from log import write_log_file -from configure import HISTORY_DIR, HISTORY_DIR2, FILE_TIMESTAMP, SAMPLE_SIZE_FILE, TEMP_DIR, \ - PARAMETER_FOR_BUILDCMATRIX, PARAMETER_FOR_BUILDRMATRIX, \ - PARAMETER_FOR_NET, PARAMETER_FOR_NET_TRAVADB_STRESS, PARAMETER_FOR_NET_TRAVADB_MAP, PARAMETER_FOR_NET_MILD_DROUGHT, PARAMETER_FOR_NET_WIGGELAB_DIURNAL, \ - BINDING_FILE, TPM_FILE, \ - BUILDRMATRIX_RENEW_INTERVAL, MIN_RNA_SEQ_INCREASE, UPDATE_NETWORK_LOG_FILE, NEW_OR_UPDATED_CHIP_FILE, \ - RNA_SEQ_INFO_DATABASE, RNA_SEQ_INFO_DATABASE_JSON, GENE_ID_FIRST_TWO_LETTERS, MEMORY_STRENGTH, \ - MAPPED_RDATA_DIR, MAPPED_CDATA_DIR, \ - EDGE_POOL_DIR, MERGED_EDGE_FILE, \ - TARGET_TF_FILE - - - -## Helper functions - -def get_value(s, delimit): - lst = s.split(delimit, 1) # only split at the first delimit - return lst[1].strip() - - -def validate_webapp_dir(para_for_net): - ''' Make sure this function is executed under the directory Code. ''' - glb_param_dict = make_global_param_dict(para_for_net) - # if genes.json is not present, create one - if not os.path.exists('../Webapp/static/json/genes.json'): - print('[update_network.py]: cannot find genes.json, make one ...') - cmd = 'python3 text2json.py %s > ../Webapp/static/json/genes.json' % (glb_param_dict['GENE_ID_AND_GENE_NAME']) - os.system(cmd) - - -def make_paths(s): - if not os.path.isdir(s): - os.makedirs(s) - - -def make_important_dirs(): - make_paths('../Data/history/edges/many_targets') - make_paths('../Data/history/edges/one_target') - make_paths('../Data/log') - make_paths('../Data/information') - make_paths('../Data/temp') - make_paths('../Data/upload') - make_paths('../Data/parameter') - make_paths('../Data/R/Mapped') - make_paths('../Data/R/Mapped/public') - make_paths('../Data/R/Mapped/inhouse') - make_paths('../Data/R/Mapped/other') - make_paths('../Data/R/Raw') - make_paths('../Data/C/Mapped') - make_paths('../Data/C/Raw') - make_paths('../Data/history/edges') - make_paths(EDGE_POOL_DIR) - make_paths('../Data/history/bind') - make_paths('../Data/history/expr') - make_paths('../Webapp/static/json') - make_paths('../Webapp/static/edges') - make_paths('../Webapp/templates') - - -def num_line(fname): - ''' Return number of lines in file fname. ''' - if not os.path.exists(fname): - return 0 - f = open(fname) - lines = f.readlines() - f.close() - return len(lines) - - -def num_ids(fname): - ''' Return number of IDs in fname. ''' - f = open(fname) - lines = f.readlines() - f.close() - return len(lines[0].split('\t')) - 1 - - -def write_sample_size_file(sample_size_file, curr_date, tpm_sample_size): - if not os.path.exists(sample_size_file): - f = open(sample_size_file, 'w') - else: - f = open(sample_size_file, 'a') - f.write('%s\t%s\n' % (curr_date, tpm_sample_size)) - f.close() - - -def age_of_file_in_days(fname): - ''' Return age of fname in days. ''' - st = os.stat(fname) - days = (time.time() - st.st_mtime)/(3600*24.0) - return days - - -def age_of_file_in_seconds(fname): - ''' Return age of fname in days. ''' - st = os.stat(fname) - seconds = time.time() - st.st_mtime - return seconds - - -def hold_on(fname): - f = open(fname) - lines = f.readlines() - f.close() - for line in lines[:100]: # check the first 100 lines for HOLDON - line = line.strip() - if line.startswith('%%HOLDON=YES'): - return True - return False - - -def all_files_present(lst): - missing_file_lst = [] - for path in lst: # lst is a list of file names to check - if not os.path.exists(path): - if 'edges.txt' in path: - write_log_file('[update_network.py] WARNING: must have %s to update network. Call create_edges*.py to create edge files.' % (path), UPDATE_NETWORK_LOG_FILE) - missing_file_lst.append(path) - return missing_file_lst - - -def record_file_time(lst, fname): - ''' - lst - a list of files - fname - a recorder file - ''' - f = open(fname, 'w') - s = '' - for x in lst: - if os.path.exists(x): - s += '%s\t%d\n' % (os.path.basename(x), int(os.stat(x).st_mtime)) - else: - s += '%s\t%d\n' % (os.path.basename(x), 0) - f.write(s) - f.close() - - -def read_file_timestamp(ftimestamp): - d = {} - f = open(ftimestamp) - for line in f: - line = line.strip() - lst = line.split() - fname = lst[0] - t = lst[1] - d[fname] = int(t) - - f.close() - return d - - -def file_updated(fname, d): - ft = int(os.stat(fname).st_mtime) - k = os.path.basename(fname) - return ft > d[k] - - -def get_updated_files(lst, d): - result = [] - for x in lst: - if file_updated(x, d): - result.append(os.path.basename(x)) - return result - - -def get_sample_size(d, sorted_keys, day, rcond_string): - - if rcond_string.isdigit(): - return int(rcond_string) - - if len(d) == 0: - return 1200 # a default number of sample size, CHANGE - - for x in sorted_keys: - if x >= day: - return d[x] - - k = sorted_keys[-1] # last key, latest date - return d[k] - - -def number_rnaseq_id(tpm_file): - f = open(tpm_file) - first_line = f.readlines()[0] - f.close() - first_line = first_line.strip() - return len(first_line.split()) - 1 - - -def number_rnaseq_diff(para_file, tpm_file): - ''' count the number @ in para_file, and count the number of columns in tpm_file, return their difference ''' - a = 0 - f = open(para_file) - for line in f: - line = line.strip() - if line.startswith('@'): - a += 1 - f.close() - - b = number_rnaseq_id(tpm_file) - - return a - b - - -def validate_gene_file(fname): - f = open(fname) - lines = f.readlines() - f.close() - for line in lines: # check all lines - line = line.strip() - lst = line.split('\t') - if len(lst) < 6: - print('[update_network.py]:Not enought fields: %s. Only %d are given. Each line must have gene_id, gene_name, chr, start, end, strand, description (optional). See prepare_gene_file.py in the documentation on how to prepare this file.' % (line, len(lst))) - sys.exit() - - -def validate_parameter_for_buildcmatrix(fname): - # first the file must exist - if not os.path.exists(fname): - print('[update_network.py]:CANNOT FIND %s.' % (fname)) - sys.exit() - f = open(fname) - lines = f.readlines() - f.close() - d = {} - location_count = 0 - for line in lines: - line = line.strip() - if line.startswith('%%'): - k, v = get_key_value(line[2:]) - d[k] = v - if k == 'GENE_FILE' or k == 'CHR_INFO': - if not os.path.exists(v): - print('[update_network.py]:%s not exists.' % (v)) - sys.exit() - if k == 'GENE_FILE': - validate_gene_file(v) - if k == 'DESTINATION': - if not os.path.isdir(v): - print('[update_network.py]:%s not exists.' % (v)) - sys.exit() - if k == 'TARGET_RANGE': - if int(v) <= 0: - print('[update_network.py]:Target range (%d) must be greater than 0.' % (v)) - sys.exit() - if line.startswith('LOCATION:'): - v = get_value(line, ':') - location_count += 1 - if not os.path.exists(v): - print('[Warning] update_network.py: Location %s does not exists.' % (v)) - #sys.exit() - - if not 'GENE_FILE' in d: - print('[update_network.py]:Must specify GENE_FILE.') - sys.exit() - if not 'DESTINATION' in d: - print('[update_network.py]:Must specify DESTINATION.') - sys.exit() - if not 'CHR_INFO' in d: - print('[update_network.py]:Must specify CHR_INFO.') - sys.exit() - if location_count == 0: - print('[update_network.py]:Must contain at least one ChIP-seq.') - sys.exit() - - -def validate_parameter_for_buildrmatrix(fname): - # first the file must exist - if not os.path.exists(fname): - print('[update_network.py]:CANNOT FIND %s.' % (fname)) - sys.exit() - f = open(fname) - lines = f.readlines() - f.close() - d = {} - location_count = 0 - for line in lines: - line = line.strip() - if line.startswith('%%'): - k, v = get_key_value(line[2:]) - d[k] = v - if k == 'GENE_LIST': - if not os.path.exists(v): - print('[update_network.py]:%s not exists.' % (v)) - sys.exit() - if line.startswith('LOCATION:'): - v = get_value(line, ':') - location_count += 1 - if not os.path.exists(v): - print('[update_network.py]:Location %s does not exists.' % (v)) - #sys.exit() - - if not 'GENE_LIST' in d: - print('[update_network.py]:Must specify GENE_LIST.') - sys.exit() - if location_count == 0: - print('[update_network.py]:Must contain at least one RNA-seq.') - sys.exit() - - -def validate_parameter_for_net(fname): - # first the file must exist - if not os.path.exists(fname): - print('[update_network.py]:CANNOT FIND %s.' % (fname)) - sys.exit() - f = open(fname) - lines = f.readlines() - f.close() - d = {} - location_count = 0 - for line in lines: - line = line.strip() - if line.startswith('%%'): - k, v = get_key_value(line[2:]) - d[k] = v - if k == 'GENE_LIST': - if not os.path.exists(v): - print('[update_network.py]:%s not exists.' % (v)) - sys.exit() - if k == 'GENE_ID_AND_GENE_NAME': - if not os.path.exists(v): - print('[update_network.py]:%s not exists.' % (v)) - sys.exit() - if k == 'BINDING_INFO': - if not os.path.exists(v): - print('[update_network.py]:%s not exists.' % (v)) - sys.exit() - if k == 'EXPRESSION_INFO': - if not os.path.exists(v): - print('[update_network.py]:%s not exists.' % (v)) - sys.exit() - if k == 'BINDING_MATRIX': - if not os.path.exists(v): - print('[update_network.py]:%s not exists.' % (v)) - print('[update_network.py]:Use python3 buildCmatrix.py paramter_for_buildCmatrix.txt > binding.txt to create binding.txt.') - if k == 'EXPRESSION_MATRIX': - if not os.path.exists(v): - print('[update_network.py]:%s not exists.' % (v)) - print('[update_network.py]:Use python3 buildRmatrix.py paramter_for_buildRmatrix.txt to create TPM.txt.') - - if not 'GENE_LIST' in d: - print('[update_network.py]:Must specify GENE_FILE.') - sys.exit() - if not 'GENE_ID_AND_GENE_NAME' in d: - print('[update_network.py]:Must specify GENE_ID_AND_GENE_NAME.') - sys.exit() - if not 'BINDING_INFO' in d: - print('[update_network.py]:Must specify BINDING_INFO.') - sys.exit() - if not 'EXPRESSION_INFO' in d: - print('[update_network.py]:Must specify EXPRESSION_INFO.') - sys.exit() - if not 'BINDING_MATRIX' in d: - print('[update_network.py]:%s not exists.' % (v)) - print('[update_network.py]:Use python3 buildCmatrix.py paramter_for_buildCmatrix.txt > binding.txt to create binding.txt.') - if not 'EXPRESSION_MATRIX' in d: - print('[update_network.py]:%s not exists.' % (v)) - print('[update_network.py]:Use python3 buildRmatrix.py paramter_for_buildRmatrix.txt to create TPM.txt.') - - - -def need_update_parameter_file(param_file, dirs): - ''' Make sure param_file is consistent with dirs (a list of directories to check against). ''' - result = [] - - files_in_parameter = {} - f = open(param_file) - for line in f: - line = line.strip() - if line.startswith('LOCATION:'): - lst = line.split(':') - k = os.path.abspath(lst[1]) - files_in_parameter[k] = 1 - f.close() - param_modification_time = os.path.getmtime(param_file) - - files_in_dirs = {} - for directory in dirs: - for root, dirnames, filenames in os.walk(os.path.abspath(directory)): - for filename in filenames: - k = os.path.join(root, filename) - files_in_dirs[k] = 1 - if 'narrowPeak' in k or '_quant' in k: - if not k in files_in_parameter and os.path.getmtime(k) > param_modification_time: - result.append('%s is not in %s' % (k, param_file)) - - return result - - - -def validate_binding_file(fname): - f = open(fname) - lines = f.readlines() - for line in lines: - line = line.strip() - if 'buildCmatrix: ChIP-seq ID list is empty.' in line: - return False - f.close() - return True - - -def lines_with_10_fields(s): - result = [] - for line in s.split('\n'): - line = line.strip() - if len(line.split('\t')) == 10: - result.append(line) - return result - - -def concatenate_edge_files(fname_lst, fname_out): - fout = open(fname_out, 'w') - for fname in fname_lst: - f = open(fname) - s = f.read() - f.close() - # Make sure each edge has 10 fields before writing. - lines = lines_with_10_fields(s) - if lines != []: - write_log_file('[update_network.py] In function concatenate_edge_files. File %s has %d rows with 10 columns.' % (fname, len(lines)), UPDATE_NETWORK_LOG_FILE) - fout.write('\n'.join(lines) + '\n') - else: - write_log_file('[update_network.py] In function concatenate_edge_files. Check file %s. It has no rows with 10 fields.' % (fname), UPDATE_NETWORK_LOG_FILE) - fout.close() - - -def delete_edge_files(fname_lst): - for fname in fname_lst: - # Before we delete, we should make sure it is not being written. Make sure it is old enough. Otherwise, don't delete. - if age_of_file_in_seconds(fname) > 12*60*60: # 10 minutes - os.remove(fname) - else: - write_log_file('[update_network.py] In function delete_edge_files. Check file %s. It is probably still being written. So I don\'t delete it.' % (fname), UPDATE_NETWORK_LOG_FILE) - - -def create_edges0(): - if os.path.exists(PARAMETER_FOR_NET): - write_log_file('[update_network.py] Create simple edges.txt using create_edges0.py with %s' % (PARAMETER_FOR_NET), UPDATE_NETWORK_LOG_FILE) - cmd = 'python3 create_edges0.py %s' % (PARAMETER_FOR_NET) - os.system(cmd) - - # The following commands are optional. For example, if a user wants to run it locally, he don't have to provide these TPM tables. - if os.path.exists(PARAMETER_FOR_NET_TRAVADB_STRESS): - #write_log_file('[update_network.py] Create simple edges.txt using create_edges0.py with %s' % (PARAMETER_FOR_NET_TRAVADB_STRESS), UPDATE_NETWORK_LOG_FILE) - cmd = 'python3 create_edges0.py %s' % (PARAMETER_FOR_NET_TRAVADB_STRESS) - #os.system(cmd) - - if os.path.exists(PARAMETER_FOR_NET_TRAVADB_MAP): - #write_log_file('[update_network.py] Create simple edges.txt using create_edges0.py with %s' % (PARAMETER_FOR_NET_TRAVADB_MAP), UPDATE_NETWORK_LOG_FILE) - cmd = 'python3 create_edges0.py %s' % (PARAMETER_FOR_NET_TRAVADB_MAP) - #os.system(cmd) - - if os.path.exists(PARAMETER_FOR_NET_MILD_DROUGHT): - #write_log_file('[update_network.py] Create simple edges.txt using create_edges0.py with %s' % (PARAMETER_FOR_NET_MILD_DROUGHT), UPDATE_NETWORK_LOG_FILE) - cmd = 'python3 create_edges0.py %s' % (PARAMETER_FOR_NET_MILD_DROUGHT) - #os.system(cmd) - - if os.path.exists(PARAMETER_FOR_NET_WIGGELAB_DIURNAL): - #write_log_file('[update_network.py] Create simple edges.txt using create_edges0.py with %s' % (PARAMETER_FOR_NET_WIGGELAB_DIURNAL), UPDATE_NETWORK_LOG_FILE) - cmd = 'python3 create_edges0.py %s' % (PARAMETER_FOR_NET_WIGGELAB_DIURNAL) - #os.system(cmd) - - -def create_edges0B(): - if os.path.exists(PARAMETER_FOR_NET): - write_log_file('[update_network.py] Create tissue-specific edges.txt using new binding.txt (size=%d). create_edges0B.py' % (num_ids(BINDING_FILE)), UPDATE_NETWORK_LOG_FILE) - cmd = 'python3 create_edges0B.py %s' % (PARAMETER_FOR_NET) # call correlation_per_tissue.R - os.system(cmd) - - -def wedge(): - if os.path.exists(PARAMETER_FOR_NET): - write_log_file('[update_network.py] Create edges using wedge shapes. wedge.R', UPDATE_NETWORK_LOG_FILE) - cmd = 'Rscript wedge.R' - os.system(cmd) - - -def correlation_per_group(): - # For 3,130 RNA-seq samples and 30,000 pairs, need at least 10 hours. - if os.path.exists(PARAMETER_FOR_NET): - write_log_file('[update_network.py] Create group-specific edges.txt using new TPM.txt (size=%d). correlation_per_group.R' % (number_rnaseq_id(TPM_FILE)), UPDATE_NETWORK_LOG_FILE) - cmd = 'Rscript correlation_per_group.R' - os.system(cmd) - - -def correlation_per_group_fixed_number(): - if os.path.exists(PARAMETER_FOR_NET): - write_log_file('[update_network.py] Create group-specific (fixed) edges.txt using new TPM.txt (size=%d). correlation_per_group_fixed_number.R' % (number_rnaseq_id(TPM_FILE)), UPDATE_NETWORK_LOG_FILE) - cmd = 'Rscript correlation_per_group_fixed_number.R' - os.system(cmd) - - -def correlation_mixtools(num_component): - if os.system('pidof R') != 0: # since it take long time (several days) to run create_edges_mixtool.R, so we make sure the previous R computing has finished before we start a new one. os.system returns 0 if R is running. - write_log_file('[update_network.py] Create edges.txt using TPM.txt (size=%d). create_edges_mixtool.R with %d components.' % (number_rnaseq_id(TPM_FILE), num_component), UPDATE_NETWORK_LOG_FILE) - cmd = 'Rscript create_edges_mixtool.R %d' % (num_component) - os.system(cmd) - - -def check_rnaseq_info(): - # check rnaseq_info_database.txt and rnaseq_info_database.json. If they are outdated, then remind us to update it in log file. - if os.path.exists(RNA_SEQ_INFO_DATABASE): - if age_of_file_in_days(RNA_SEQ_INFO_DATABASE) > 120: # older than 120 days - write_log_file('[update_network.py] Need update %s. It is %d days old.' % (RNA_SEQ_INFO_DATABASE, age_of_file_in_days(RNA_SEQ_INFO_DATABASE)), UPDATE_NETWORK_LOG_FILE) - else: - write_log_file('[update_network.py] [MISSING] Must create %s.' % (RNA_SEQ_INFO_DATABASE), UPDATE_NETWORK_LOG_FILE) - - if os.path.exists(RNA_SEQ_INFO_DATABASE_JSON): - if age_of_file_in_days(RNA_SEQ_INFO_DATABASE_JSON) > 120: - write_log_file('[update_network.py] Need update %s. It is %d days old.' % (RNA_SEQ_INFO_DATABASE_JSON, age_of_file_in_days(RNA_SEQ_INFO_DATABASE_JSON)), UPDATE_NETWORK_LOG_FILE) - else: - write_log_file('[update_network.py] [MISSING] Must create %s.' % (RNA_SEQ_INFO_DATABASE_JSON), UPDATE_NETWORK_LOG_FILE) - - -# main -FILE_LIST_TO_CHECK = [PARAMETER_FOR_BUILDCMATRIX, PARAMETER_FOR_BUILDRMATRIX, PARAMETER_FOR_NET, \ - MERGED_EDGE_FILE, BINDING_FILE, TPM_FILE] # a list of important files - -make_important_dirs() # make important directories (if non-existent) for holding various kinds of files, must be put after os.chdir(CODE_DIR) -#validate_webapp_dir(PARAMETER_FOR_NET) # make sure the directory Webapp contains necessary files, e.g., genes.json. - -check_rnaseq_info() # rnaseq informtion is useful for displaying scatterplots - -# Make sure all necessary files are present, if not, make them if possible -miss_lst = all_files_present(FILE_LIST_TO_CHECK) # check if any of them are missing -if miss_lst != []: # miss_lst is non-empty in the beginning. - print('These mandatory files are missing: %s.\nPrepare them first.' % (' '.join(miss_lst))) - write_log_file('[update_network.py] Cannot find these required files:%s' % (' '.join(miss_lst)), UPDATE_NETWORK_LOG_FILE) - - # initially, we (at most) only have three parameter files, no binding.txt, TPM.txt or edges.txt ... - important_miss_number = 0 - if PARAMETER_FOR_BUILDCMATRIX in miss_lst: - print('[update_network.py]: must prepare %s first.' % (PARAMETER_FOR_BUILDCMATRIX)) - important_miss_number += 1 - - if PARAMETER_FOR_BUILDRMATRIX in miss_lst: - print('[update_network.py]: must prepare %s first.' % (PARAMETER_FOR_BUILDRMATRIX)) - important_miss_number += 1 - - if PARAMETER_FOR_NET in miss_lst: - print('[update_network.py]: must prepare %s first.' % (PARAMETER_FOR_NET)) - important_miss_number += 1 - - if important_miss_number > 0: - sys.exit() # need to provide all the above three files; otherwise cannot proceed - - if BINDING_FILE in miss_lst: - print('[update_network.py]: make initial binding.txt ... wait') - write_log_file('[update_network.py] Make initial binding.txt', UPDATE_NETWORK_LOG_FILE) - cmd = 'python3 get_binding.py %s' % (PARAMETER_FOR_BUILDCMATRIX) - #os.system(cmd) - cmd = 'python3 buildCmatrix.py %s > %s' % (PARAMETER_FOR_BUILDCMATRIX, BINDING_FILE) - #os.system(cmd) - print('[update_network.py]: IMPORATNT: make sure BINDING_MATRIX in %s was set %s and rerun update_network.py.' % (PARAMETER_FOR_NET, BINDING_FILE)) - sys.exit() - - if TPM_FILE in miss_lst: - print('[update_network.py]: make initial TPM.txt ... wait') - write_log_file('[update_network.py] Make initial TPM.txt', UPDATE_NETWORK_LOG_FILE) - cmd = 'python3 buildRmatrix.py %s' % (PARAMETER_FOR_BUILDRMATRIX) # produce TPM.txt - os.system(cmd) - print('[update_network.py]:IMPORTANT: make sure EXPRESSION_MATRIX in %s was set %s and rerun update_network.py.' % (PARAMETER_FOR_NET, TPM_FILE)) - sys.exit() - - miss_lst2 = all_files_present(FILE_LIST_TO_CHECK) # check files again - if len(miss_lst2) == 1 and miss_lst2[0] == MERGED_EDGE_FILE: # all other files are ready except edges.txt, make one. - print('[update_network.py]: make initial edges.txt ... wait') - create_edgeds0() - - -# Make json2 (sliced binding.txt) if it does not exist. Copy json2 to -# the web application folder static/edges [do it manually] for displaying -# binding strength plots. -if not os.path.isdir('../Data/history/bind/json2') and os.path.exists(BINDING_FILE): - write_log_file('Make directory ../Data/history/bind/json2. Don\'t forget to copy json2 to static/edges in the web application.', UPDATE_NETWORK_LOG_FILE) - cmd = 'python3 slice_binding_to_JSON.py %s' % (PARAMETER_FOR_NET) - os.system(cmd) - - -# Make json (sliced TPM.txt) if it does not exist. Copy json to the -# web application folder static/edges [manual] for displaying gene -# expression scatterplots. -if not os.path.isdir('../Data/history/expr/json') and os.path.exists(TPM_FILE): - write_log_file('Make directory ../Data/history/expr/json. Don\'t forget to copy json to static/edges in the web application.', UPDATE_NETWORK_LOG_FILE) - cmd = 'python3 slice_TPM_to_JSON.py %s' % (PARAMETER_FOR_NET) - os.system(cmd) - - -# Make sure parameter files are present and valid (rudimentary check but important) -validate_parameter_for_buildcmatrix(PARAMETER_FOR_BUILDCMATRIX) -validate_parameter_for_buildrmatrix(PARAMETER_FOR_BUILDRMATRIX) -validate_parameter_for_net(PARAMETER_FOR_NET) - - -# If the file timestamp does not exist, create one -if not os.path.exists(FILE_TIMESTAMP): - record_file_time(FILE_LIST_TO_CHECK, FILE_TIMESTAMP) - -# Get update time of mandatory files -timestamp_dict = read_file_timestamp(FILE_TIMESTAMP) - - - -################## binding.txt stuff ##################################### -# Check parameter_for_buildCmatrix.txt -updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict) -if 'parameter_for_buildCmatrix.txt' in updated_file_list and not hold_on(PARAMETER_FOR_BUILDCMATRIX): - write_log_file('[update_network.py] Parameter file %s has been updated.' % (PARAMETER_FOR_BUILDCMATRIX), UPDATE_NETWORK_LOG_FILE) - write_log_file('[update_network.py] Make binding column files', UPDATE_NETWORK_LOG_FILE) - cmd = 'python3 get_binding.py %s' % (PARAMETER_FOR_BUILDCMATRIX) # won't re-compute existing binding columns unless updated - os.system(cmd) - - -updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict) -if 'binding.txt' in updated_file_list: - write_log_file('[update_network.py] binding.txt has been updated. This update will take effect next time TPM.txt is updated.', UPDATE_NETWORK_LOG_FILE) - # create_edges0() - # create_edges0B() - # wedge() - # correlation_per_group() - # correlation_per_group_fixed_number() - # correlation_mixtools(2) - # correlation_mixtools(3) - - ## TODO mixtool stuff, forget it for now. - #cmd = 'nohup python3 create_edges4.py %s &' % (temp_file_name) - #os.system(cmd) - - - - -################## TPM.txt stuff ##################################### - -# update parameter_for_buildRmatrix.txt periodically and automatically. -if datetime.now().day % BUILDRMATRIX_RENEW_INTERVAL == 0: # check if need to update parameter_for_buildRmatrix.txt bi-weekly - curr_time = datetime.now().strftime('%Y%m%d%H%M') - new_parameter_file = '../Data/temp/parameter_for_buildRmatrix.%s' % (curr_time) - cmd = 'python3 make_parameter_rnaseq.py > %s' % (new_parameter_file) # new_parameter_file will not be updated unless download_and_map.py has finished. - os.system(cmd) - num = number_rnaseq_diff(new_parameter_file, TPM_FILE) - if num >= MIN_RNA_SEQ_INCREASE: # sufficient number of RNA-seq samples have been added - write_log_file('[update_network.py] Update %s' % (PARAMETER_FOR_BUILDRMATRIX), UPDATE_NETWORK_LOG_FILE) - cmd = 'cp %s %s' % (new_parameter_file, PARAMETER_FOR_BUILDRMATRIX) - os.system(cmd) - else: - write_log_file('[update_network.py] You have downloaded %d RNA-seq since last build of TPM.txt. TPM.txt will be rebuilt if this number reaches %d.' % (num, MIN_RNA_SEQ_INCREASE), UPDATE_NETWORK_LOG_FILE) - - -# Check if parameter_for_buildRmatrix.txt has been updated -updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict) -# TODO To simplify things, I will provide TPM.txt directly. So set the -# HOLDON option to YES in parameter_for_buildRmatrix.txt to prevent -# the following from being True. -if 'parameter_for_buildRmatrix.txt' in updated_file_list and not hold_on(PARAMETER_FOR_BUILDRMATRIX): - write_log_file('[update_network.py] Parameter file %s has been updated.' % (PARAMETER_FOR_BUILDRMATRIX), UPDATE_NETWORK_LOG_FILE) - write_log_file('[update_network.py] Rebuild TPM.txt ...', UPDATE_NETWORK_LOG_FILE) - curr_time = datetime.now().strftime('%Y%m%d%H%M%S') - if os.path.exists(TPM_FILE): - backup_file_name = '../Data/history/expr/TPM.txt.backup.at.%s' % (curr_time) - cmd = 'cp %s %s' % (TPM_FILE, backup_file_name) - os.system(cmd) - cmd = 'gzip %s' % (backup_file_name) - os.system(cmd) - - cmd = 'python3 buildRmatrix.py %s' % (PARAMETER_FOR_BUILDRMATRIX) # produce TPM.txt, whose location is specified in TPM_TABLE in buidlRmatrix.py - os.system(cmd) - - curr_date = datetime.now().strftime('%Y%m%d') - tpm_sample_size = number_rnaseq_id(TPM_FILE) - write_sample_size_file(SAMPLE_SIZE_FILE, curr_date, tpm_sample_size) - - - -# Create edges using all RNA-seq experiments -updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict) -if 'TPM.txt' in updated_file_list: # we could _touch_ TPM.txt to make it recent. We will recompute edges using the full binding.txt. - # Make a full binding.txt since we are going to use the new TPM.txt to recompute all edges - write_log_file('[update_network.py] Build full binding matrix for the new TPM.txt.', UPDATE_NETWORK_LOG_FILE) - cmd = 'python3 buildCmatrix.py %s include-all > %s' % (PARAMETER_FOR_BUILDCMATRIX, BINDING_FILE) # include all ChIP-seq IDs. Pay attention to include-all in the command-line argument. - os.system(cmd) - - # target_tf.txt - write_log_file('[update_network.py] Make target_tf.txt.', UPDATE_NETWORK_LOG_FILE) - cmd = 'python3 make_target_tf.py %s > %s' % (PARAMETER_FOR_NET, TARGET_TF_FILE) - os.system(cmd) - - write_log_file('[update_network.py] Update ../Data/history/expr/json using the new TPM.txt. Don\'t forget to update the static/edges/json folder in the web application.', UPDATE_NETWORK_LOG_FILE) - ## json -- make/renew json directory for displaying scatterplots - cmd = 'python3 slice_TPM_to_JSON.py %s' % (PARAMETER_FOR_NET) - ## os.system(cmd) # turn this on if we are going to use this TPM.txt for displaying scatterplots - write_log_file('[update_network.py] Update directory ../Data/history/bind/json2. Don\'t forget to copy json2 to static/edges in the web application.', UPDATE_NETWORK_LOG_FILE) - cmd = 'python3 slice_binding_to_JSON.py %s' % (PARAMETER_FOR_NET) - #os.system(cmd) # turn this on if we are going to use this binding.txt for displaying bar charts of binding strengths - ## copy ../Data/history/bind/json2 and ../Data/history/expr/json to the web application folder 'static/edges' [manual] - - if False: # TODO For now I will always use travadb's TPM.txt (138 columns) to display scatterplots. Simpler and faster. - write_log_file('Assign tissue, refine tissue and update rnaseq_info_database.json', UPDATE_NETWORK_LOG_FILE) - os.environ["PYTHONIOENCODING"] = "UTF-8" # for non-ascii letters in ENA RNA-sample description. If this statement does not work, try 'export PYTHONIOENCODING=UTF-8' in the command line instead. The export command can be put in crontab -e before running this script - cmd = 'python3 assign_tissue.py' - os.system(cmd) - cmd = 'python3 refine_tissue.py > ../Data/information/experiment.and.tissue.2.txt' - os.system(cmd) - cmd = 'python3 update_rnaseq_info_json.py' - os.system(cmd) - - - - # Compute edges. This could take a lot of time so update FILE_TIMESTAMP first. - record_file_time(FILE_LIST_TO_CHECK, FILE_TIMESTAMP) - create_edges0() - create_edges0B() - wedge() - correlation_per_group() - correlation_per_group_fixed_number() - correlation_mixtools(2) # two components - #correlation_mixtools(3) - - -########## Merge edges ####################### -# update edges.txt, a merged file from two sources, HISTORY_DIR and HISTORY_DIR2. Some new edge files are being generated ... -time.sleep(5) -edge_file_lst = [] # collect edge files. -most_recent_edge_modification_time = 0 -write_log_file('[update_network.py] Look at edge files in %s.' % (HISTORY_DIR), UPDATE_NETWORK_LOG_FILE) -for fname in glob.glob(os.path.join(HISTORY_DIR, 'edges.txt.*')): # many small edges.txt.* are to be merged - edge_file_lst.append(fname) - if os.path.getmtime(fname) > most_recent_edge_modification_time: - most_recent_edge_modification_time = os.path.getmtime(fname) - -write_log_file('[update_network.py] Look at edge files in %s.' % (HISTORY_DIR2), UPDATE_NETWORK_LOG_FILE) -for fname in glob.glob(os.path.join(HISTORY_DIR2, 'edges.txt.*')): # edges.txt.* are to be merged - edge_file_lst.append(fname) - if os.path.getmtime(fname) > most_recent_edge_modification_time: - most_recent_edge_modification_time = os.path.getmtime(fname) - - -if edge_file_lst == []: - write_log_file('[update_network.py] No edge files to merge in %s and %s.' % (HISTORY_DIR, HISTORY_DIR2), UPDATE_NETWORK_LOG_FILE) -elif os.path.getmtime(MERGED_EDGE_FILE) < most_recent_edge_modification_time: # update edges.txt only if there are newer edges to add. - # concatenate edge files into one - write_log_file('[update_network.py] Concatenate edge files in %s and %s into one file.' % (HISTORY_DIR, HISTORY_DIR2), UPDATE_NETWORK_LOG_FILE) - curr_time = datetime.now().strftime('%Y%m%d_%H%M') - concatenate_edge_files(edge_file_lst, os.path.join(EDGE_POOL_DIR, 'edges.txt.many.one.targets.' + curr_time)) - delete_edge_files(edge_file_lst) - -if os.path.getmtime(MERGED_EDGE_FILE) < os.path.getmtime(EDGE_POOL_DIR): # edge pool directory has been updated, create new edges.txt - write_log_file('[update_network.py] Make a new edges.txt from edge files in %s.' % (EDGE_POOL_DIR), UPDATE_NETWORK_LOG_FILE) - write_log_file('[update_network.py] Number of lines in the old edges.txt: %d.' % (num_line(MERGED_EDGE_FILE)), UPDATE_NETWORK_LOG_FILE) - cmd = 'python3 merge_edges.py' - os.system(cmd) - write_log_file('[update_network.py] Number of lines in the new edges.txt: %d.' % (num_line(MERGED_EDGE_FILE)), UPDATE_NETWORK_LOG_FILE) - manual_copy_commands = 'Please copy files to the web application: sudo cp /home/lanhui/brain/Data/temp/edges.txt /var/www/brain/brain/static/edges/edges.txt sudo cp /home/lanhui/brain/Data/temp/html_edges/edges.sqlite /var/www/brain/brain/static/edges' - write_log_file('[update_network.py] %s' % (manual_copy_commands), UPDATE_NETWORK_LOG_FILE) - - -# exclude edges as suggested by Phil Wigge. -# write_log_file('Exclude edges (now ineffective)', UPDATE_NETWORK_LOG_FILE) -# cmd = 'python3 exclude_edges.py %s' % (EDGE_FILE) -#os.system(cmd) - -# # check if parameter_for_net.txt, or TPM.txt is updated, if yes, create edges. -# updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict) -# if ('parameter_for_net.txt' in updated_file_list or 'TPM.txt' in updated_file_list) and not hold_on(PARAMETER_FOR_NET): -# write_log_file('Create edges.txt using new TPM.txt (size=%d) ...' % (number_rnaseq_id(TPM_FILE)), UPDATE_NETWORK_LOG_FILE) -# time.sleep(7200) # wait one hour for the previous create_edges4.py (if any) to finish creating JSON_DIR and target_tf_fname -# cmd = 'nohup python3 create_edges4.py %s &' % (PARAMETER_FOR_NET) # put process to background -# os.system(cmd) -# time.sleep(60) - - -# remove .R files in ../Data/temp. Files older than 3 days will be removed -cmd = 'find %s -mtime +2 -name \"*.R\" -delete' % (TEMP_DIR) -os.system(cmd) - -# update time stamp file -record_file_time(FILE_LIST_TO_CHECK, FILE_TIMESTAMP) - -write_log_file('[update_network.py] Update done at %s.\n\n' % (datetime.now().strftime('%Y-%m-%d %H:%M:%S')), UPDATE_NETWORK_LOG_FILE) - diff --git a/Code/.#update_network.py b/Code/.#update_network.py deleted file mode 120000 index 66ec7a4..0000000 --- a/Code/.#update_network.py +++ /dev/null @@ -1 +0,0 @@ -lanhui@VM-0-14-ubuntu.13592:1688234417
\ No newline at end of file diff --git a/Code/update_network.py b/Code/update_network.py index d5aed9b..7b26f58 100755 --- a/Code/update_network.py +++ b/Code/update_network.py @@ -436,7 +436,6 @@ def need_update_parameter_file(param_file, dirs): return result - def validate_binding_file(fname): f = open(fname) lines = f.readlines() @@ -448,40 +447,6 @@ def validate_binding_file(fname): return True -def lines_with_10_fields(s): - result = [] - for line in s.split('\n'): - line = line.strip() - if len(line.split('\t')) == 10: - result.append(line) - return result - - -def concatenate_edge_files(fname_lst, fname_out): - fout = open(fname_out, 'w') - for fname in fname_lst: - f = open(fname) - s = f.read() - f.close() - # Make sure each edge has 10 fields before writing. - lines = lines_with_10_fields(s) - if lines != []: - write_log_file('[update_network.py] In function concatenate_edge_files. File %s has %d rows with 10 columns.' % (fname, len(lines)), UPDATE_NETWORK_LOG_FILE) - fout.write('\n'.join(lines) + '\n') - else: - write_log_file('[update_network.py] In function concatenate_edge_files. Check file %s. It has no rows with 10 fields.' % (fname), UPDATE_NETWORK_LOG_FILE) - fout.close() - - -def delete_edge_files(fname_lst): - for fname in fname_lst: - # Before we delete, we should make sure it is not being written. Make sure it is old enough. Otherwise, don't delete. - if age_of_file_in_seconds(fname) > 12*60*60: # 10 minutes - os.remove(fname) - else: - write_log_file('[update_network.py] In function delete_edge_files. Check file %s. It is probably still being written. So I don\'t delete it.' % (fname), UPDATE_NETWORK_LOG_FILE) - - def create_edges0(): if os.path.exists(PARAMETER_FOR_NET): write_log_file('[update_network.py] Create simple edges.txt using create_edges0.py with %s' % (PARAMETER_FOR_NET), UPDATE_NETWORK_LOG_FILE) @@ -765,49 +730,11 @@ if 'TPM.txt' in updated_file_list: # we could _touch_ TPM.txt to make it recent. #correlation_mixtools(3) -########## Merge edges ####################### -# update edges.txt, a merged file from two sources, HISTORY_DIR and HISTORY_DIR2. Some new edge files are being generated ... -time.sleep(5) -edge_file_lst = [] # collect edge files. -most_recent_edge_modification_time = 0 -write_log_file('[update_network.py] Look at edge files in %s.' % (HISTORY_DIR), UPDATE_NETWORK_LOG_FILE) -for fname in glob.glob(os.path.join(HISTORY_DIR, 'edges.txt.*')): # many small edges.txt.* are to be merged - edge_file_lst.append(fname) - if os.path.getmtime(fname) > most_recent_edge_modification_time: - most_recent_edge_modification_time = os.path.getmtime(fname) - -write_log_file('[update_network.py] Look at edge files in %s.' % (HISTORY_DIR2), UPDATE_NETWORK_LOG_FILE) -for fname in glob.glob(os.path.join(HISTORY_DIR2, 'edges.txt.*')): # edges.txt.* are to be merged - edge_file_lst.append(fname) - if os.path.getmtime(fname) > most_recent_edge_modification_time: - most_recent_edge_modification_time = os.path.getmtime(fname) - - -if edge_file_lst == []: - write_log_file('[update_network.py] No edge files to merge in %s and %s.' % (HISTORY_DIR, HISTORY_DIR2), UPDATE_NETWORK_LOG_FILE) -elif os.path.getmtime(MERGED_EDGE_FILE) < most_recent_edge_modification_time: # update edges.txt only if there are newer edges to add. - # concatenate edge files into one - write_log_file('[update_network.py] Concatenate edge files in %s and %s into one file.' % (HISTORY_DIR, HISTORY_DIR2), UPDATE_NETWORK_LOG_FILE) - curr_time = datetime.now().strftime('%Y%m%d_%H%M') - concatenate_edge_files(edge_file_lst, os.path.join(EDGE_POOL_DIR, 'edges.txt.many.one.targets.' + curr_time)) - delete_edge_files(edge_file_lst) - -if os.path.getmtime(MERGED_EDGE_FILE) < os.path.getmtime(EDGE_POOL_DIR): # edge pool directory has been updated, create new edges.txt - write_log_file('[update_network.py] Make a new edges.txt from edge files in %s.' % (EDGE_POOL_DIR), UPDATE_NETWORK_LOG_FILE) - write_log_file('[update_network.py] Number of lines in the old edges.txt: %d.' % (num_line(MERGED_EDGE_FILE)), UPDATE_NETWORK_LOG_FILE) - cmd = 'python3 merge_edges.py' - os.system(cmd) - write_log_file('[update_network.py] Number of lines in the new edges.txt: %d.' % (num_line(MERGED_EDGE_FILE)), UPDATE_NETWORK_LOG_FILE) - manual_copy_commands = 'Please copy files to the web application: sudo cp /home/lanhui/brain/Data/temp/edges.txt /var/www/brain/brain/static/edges/edges.txt sudo cp /home/lanhui/brain/Data/temp/html_edges/edges.sqlite /var/www/brain/brain/static/edges' - write_log_file('[update_network.py] %s' % (manual_copy_commands), UPDATE_NETWORK_LOG_FILE) - - # exclude edges as suggested by Phil Wigge. # write_log_file('Exclude edges (now ineffective)', UPDATE_NETWORK_LOG_FILE) # cmd = 'python3 exclude_edges.py %s' % (EDGE_FILE) #os.system(cmd) - # # check if parameter_for_net.txt, or TPM.txt is updated, if yes, create edges. # updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict) # if ('parameter_for_net.txt' in updated_file_list or 'TPM.txt' in updated_file_list) and not hold_on(PARAMETER_FOR_NET): @@ -826,4 +753,3 @@ os.system(cmd) record_file_time(FILE_LIST_TO_CHECK, FILE_TIMESTAMP) write_log_file('[update_network.py] Update done at %s.\n\n' % (datetime.now().strftime('%Y-%m-%d %H:%M:%S')), UPDATE_NETWORK_LOG_FILE) - |