# Usage: python3 update_network_by_force.py
# Purpose: update_network.py could take a few days (even weeks) to run.  Run this script to harvest new edges before update_network.py finishes.
#
# Revision history:
# 24 Nov 2019, hui
# Last modified: 5 Aug 2024, hui

import os
import sys
import glob
import time
from datetime import datetime
from configure import HISTORY_DIR, HISTORY_DIR2, UPDATE_NETWORK_LOG_FILE, MERGED_EDGE_FILE, EDGE_POOL_DIR
from configure import PARAMETER_FOR_BUILDCMATRIX, PARAMETER_FOR_BUILDRMATRIX, PARAMETER_FOR_NET
from configure import DIFF_EDGE_FILE
from backup_files import copy_and_backup_file
from log import write_log_file
from overlap import Overlap
import utils


########## Helper functions #######################
def num_line(fname):
    ''' Return number of lines in file fname, or 0 if fname does not exist. '''
    if not os.path.exists(fname):
        return 0
    # Count while streaming so a large edge file is never held in memory.
    with open(fname) as f:
        return sum(1 for _ in f)


def lines_with_10_fields(s):
    ''' Return the stripped lines of string s that have exactly 10 tab-separated fields. '''
    stripped_lines = (line.strip() for line in s.split('\n'))
    return [line for line in stripped_lines if len(line.split('\t')) == 10]


def age_of_file_in_seconds(fname):
    ''' Return the number of seconds elapsed since fname was last modified. '''
    st = os.stat(fname)
    return time.time() - st.st_mtime


def make_edge_dict_from_files(file_lst):
    '''
    Index the edges stored in the files listed in file_lst.

    Return a dictionary whose key is the concatenation of the first two
    fields of an edge line and whose value is a list of hashes, one for
    each edge line seen with that key (hash of the remaining fields joined
    together).  Only lines with exactly 10 fields are considered.
    '''
    d = {}
    for fname in file_lst:
        with open(fname) as f:
            existing_lines = lines_with_10_fields(f.read())
        for line in existing_lines:
            lst = line.split('\t')
            k = lst[0] + lst[1]
            v = hash(''.join(lst[2:]))
            d.setdefault(k, []).append(v)
    return d


def new_edge_line(line, edge_dict):
    '''
    Return True if line is not yet recorded in edge_dict, a dictionary
    produced by make_edge_dict_from_files.  Return False otherwise.
    '''
    lst = line.split('\t')
    k = lst[0] + lst[1]
    if k not in edge_dict:
        return True
    # Hash the remaining fields once, then look for it among the known hashes.
    return hash(''.join(lst[2:])) not in edge_dict[k]


def concatenate_edge_files(fname_lst, dir_out, fname_out):
    '''
    Write the edge lines found in the files in fname_lst to dir_out/fname_out.

    Only lines with 10 fields are written, and lines already present in any
    edges.txt.* file in dir_out are skipped, to save space.
    '''
    edge_dict = make_edge_dict_from_files(glob.glob(os.path.join(dir_out, 'edges.txt.*')))
    # Append instead of truncating.  If fname_out already exists in dir_out
    # (e.g., this script ran twice within the same minute), its lines are
    # already in edge_dict and would not be written again, so truncating the
    # file would lose them.
    with open(os.path.join(dir_out, fname_out), 'a') as fout:
        for fname in fname_lst:
            with open(fname) as f:
                s = f.read()
            # Make sure each edge has 10 fields before writing.
            lines = lines_with_10_fields(s)
            # do not write duplicate lines, to save space
            kept_lines = [line for line in lines if new_edge_line(line, edge_dict)]
            if kept_lines:
                fout.write('\n'.join(kept_lines) + '\n')


def delete_edge_files(fname_lst):
    '''
    Delete each file in fname_lst that was last modified more than 6 hours
    ago.  A younger file is kept and a message is written to the log file.
    '''
    age_in_hours = 6
    for fname in fname_lst:
        # Before we delete a file, we should make sure it is not being updated. Make sure it is old enough. Otherwise, don't delete.
        if age_of_file_in_seconds(fname) > age_in_hours*60*60: # 6 hours
            os.remove(fname)
        else:
            write_log_file('[update_network_by_force.py] In function delete_edge_files. Check file %s. It is probably still being written (age less than %d hours). So I don\'t delete it.' % (fname, age_in_hours), UPDATE_NETWORK_LOG_FILE)


def summarize_edge_file(fname):
    '''
    Return a one-line summary (a string) of the edges in file fname: how many
    have a value in the 9th field above the threshold tau, and how many do not.
    Only lines with 10 fields are counted.
    '''
    if not os.path.exists(fname):
        return 'File %s does not exist.' % (fname)
    tau = 2.0
    count_below = 0
    count_above = 0
    count_total = 0
    with open(fname) as f:
        for line in f:
            lst = line.strip().split('\t')
            if len(lst) == 10:
                association_strength = float(lst[8])
                count_total += 1
                if association_strength > tau:
                    count_above += 1
                else:
                    count_below += 1
    if count_total > 0:
        return '#edges above %4.1f: %d (%4.3f percent), #edges below %4.1f: %d (%4.3f percent).' % (tau, count_above, 100.0*count_above/count_total, tau, count_below, 100.0*count_below/count_total)
    else:
        return 'Total edges is 0.'


########## Merge edges #######################
# Update edges.txt, a merged file from two sources, HISTORY_DIR and HISTORY_DIR2. Some new edge files are being generated there ...
# Definition of HISTORY_DIR and HISTORY_DIR2 could be found in configure.py
# NOTE(review): the nesting of the top-level statements below was reconstructed from a copy of this file without indentation -- confirm against the repository version.
time.sleep(10)
edge_file_lst = [] # collect edge files (file names).
most_recent_edge_modification_time = 0
for history_directory in [HISTORY_DIR, HISTORY_DIR2]:
    write_log_file('[update_network_by_force.py] Look at edge files in %s.' % (history_directory), UPDATE_NETWORK_LOG_FILE)
    for fname in glob.glob(os.path.join(history_directory, 'edges.txt.*')): # many small edges.txt.* are to be merged
        edge_file_lst.append(fname)
        modification_time = os.path.getmtime(fname)
        if modification_time > most_recent_edge_modification_time:
            most_recent_edge_modification_time = modification_time

if not os.path.exists(MERGED_EDGE_FILE):
    write_log_file('[update_network_by_force.py] WARNING: missing required file %s.' % (MERGED_EDGE_FILE), UPDATE_NETWORK_LOG_FILE)
    # Create an empty edges.txt so that the os.path.getmtime calls below do not fail.
    with open(MERGED_EDGE_FILE, 'w') as f:
        f.write('')

if not edge_file_lst:
    write_log_file('[update_network_by_force.py] No edge files to merge in %s and %s.' % (HISTORY_DIR, HISTORY_DIR2), UPDATE_NETWORK_LOG_FILE)
elif os.path.getmtime(MERGED_EDGE_FILE) < most_recent_edge_modification_time: # update edges.txt only if there are newer edges to add.
    # concatenate edge files into one and store in EDGE_POOL_DIR
    write_log_file('[update_network_by_force.py] Concatenate edge files in %s and %s into one file.' % (HISTORY_DIR, HISTORY_DIR2), UPDATE_NETWORK_LOG_FILE)
    curr_time = datetime.now().strftime('%Y%m%d_%H%M')
    concatenate_edge_files(edge_file_lst, EDGE_POOL_DIR, 'edges.txt.many.one.targets.' + curr_time) # this will update EDGE_POOL_DIR
    delete_edge_files(edge_file_lst) # delete these files only when they are no longer being written.

if os.path.getmtime(MERGED_EDGE_FILE) < os.path.getmtime(EDGE_POOL_DIR): # edge pool directory has been updated, create a new edges.txt
    write_log_file('[update_network_by_force.py] Make a new edges.txt from edge files in %s.' % (EDGE_POOL_DIR), UPDATE_NETWORK_LOG_FILE)
    write_log_file('[update_network_by_force.py] Number of lines in the old edges.txt: %d.' % (num_line(MERGED_EDGE_FILE)), UPDATE_NETWORK_LOG_FILE)
    write_log_file('[update_network_by_force.py] %s' % (summarize_edge_file(MERGED_EDGE_FILE)), UPDATE_NETWORK_LOG_FILE)
    Sold = utils.get_edge_set(MERGED_EDGE_FILE) # all old edges stored in a set
    cmd = 'python3 merge_edges.py' # invoke another script to merge all edge files in EDGE_POOL_DIR
    return_value = os.system(cmd)
    if return_value != 0:
        write_log_file('[update_network_by_force.py] WARNING: something wrong occurred to merge_edges.py.  Perhaps your computer is running out of memory.', UPDATE_NETWORK_LOG_FILE)
    write_log_file('[update_network_by_force.py] Number of lines in the new edges.txt: %d.' % (num_line(MERGED_EDGE_FILE)), UPDATE_NETWORK_LOG_FILE)
    write_log_file('[update_network_by_force.py] %s' % (summarize_edge_file(MERGED_EDGE_FILE)), UPDATE_NETWORK_LOG_FILE)
    Snew = utils.get_edge_set(MERGED_EDGE_FILE) # all new edges stored in a set. Note that MERGED_EDGE_FILE has been updated by 'python3 merge_edges.py'
    utils.make_new_edges_file(Sold, Snew, MERGED_EDGE_FILE, DIFF_EDGE_FILE)
    manual_copy_commands = 'MANUAL: Please copy edges.txt to the web application: sudo cp /home/lanhui/brain/Data/temp/edges.txt /var/www/brain/brain/static/edges/edges.txt sudo cp /home/lanhui/brain/Data/temp/html_edges/edges.sqlite /var/www/brain/brain/static/edges curl'
    write_log_file('[update_network_by_force.py] %s' % (manual_copy_commands), UPDATE_NETWORK_LOG_FILE)
    write_log_file('[update_network_by_force.py] Make HTML files for the web application.', UPDATE_NETWORK_LOG_FILE)
    cmd = 'python3 html_network.py -f %s -r %s -c %s -n %s' % (MERGED_EDGE_FILE, PARAMETER_FOR_BUILDRMATRIX, PARAMETER_FOR_BUILDCMATRIX, PARAMETER_FOR_NET)
    os.system(cmd)
    if datetime.now().day % 28 == 0:
        copy_and_backup_file(MERGED_EDGE_FILE, '../Analysis') # the backup file will be used for further analysis

# Compute overlap
gold_standard_file = '../Data/temp/AtRegNet.20210208.csv'
if os.path.exists(gold_standard_file) and os.path.exists(MERGED_EDGE_FILE):
    AtRegNet_dict = {}
    with open(gold_standard_file) as f:
        for line in f:
            line = line.strip()
            lst = line.split(',')
            if lst[0] != 'TFName' and len(lst) > 4: # skip the header line and lines that are too short
                tf = lst[1].upper().strip()
                target = lst[4].upper().strip()
                AtRegNet_dict[tf+target] = 100
    BrainEdges_dict = {}
    with open(MERGED_EDGE_FILE) as f:
        for line in f:
            line = line.strip()
            lst = line.split('\t')
            # An edge has 10 fields (see lines_with_10_fields).  Skip blank or
            # malformed lines, which would otherwise raise IndexError below.
            if len(lst) != 10:
                continue
            tf = lst[1].split()[0]
            target = lst[0].split()[0]
            score = float(lst[8])
            BrainEdges_dict[tf+target] = score
    overlap = Overlap(BrainEdges_dict, 3, AtRegNet_dict, 0) # presumably 3 and 0 are score cutoffs for the two dictionaries -- see overlap.py
    # The +1 in the denominator keeps the division defined when there are no predicted positives.
    write_log_file('[update_network_by_force.py] Performance stats - TP:%d, PP:%d, Hit rate: %4.7f while comparing with AtRegNet.20210208.csv.' % (overlap.getTP(), overlap.getNumberOfPositivesInPred(), overlap.getTP()/(overlap.getNumberOfPositivesInPred()+1)), UPDATE_NETWORK_LOG_FILE)

write_log_file('[update_network_by_force.py] Update done at %s.\n\n' % (datetime.now().strftime('%Y-%m-%d %H:%M:%S')), UPDATE_NETWORK_LOG_FILE)