summaryrefslogtreecommitdiff
path: root/Code/update_network_by_force.py
blob: 1478cc08cd26081aaa469607f82712e0ec022759 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
# Usage: python3 update_network_by_force.py
# Purpose: update_network.py could take a few days (even weeks) to run.  Run this script to harvest new edges before update_network.py finishes.
#
# Revision history:
# 24 Nov 2019, hui <lanhui@zjnu.edu.cn>
# Last modified: 5 Aug 2024, hui <lanhui@zjnu.edu.cn>

import os, sys
import glob
import time
from datetime import datetime
from configure import HISTORY_DIR, HISTORY_DIR2, UPDATE_NETWORK_LOG_FILE, MERGED_EDGE_FILE, EDGE_POOL_DIR
from configure import PARAMETER_FOR_BUILDCMATRIX, PARAMETER_FOR_BUILDRMATRIX, PARAMETER_FOR_NET
from configure import DIFF_EDGE_FILE
from backup_files import copy_and_backup_file
from log import write_log_file
from overlap import Overlap
import utils

########## Helper functions #######################
def num_line(fname):
    ''' Return the number of lines in file fname, or 0 if the file does not exist. '''
    if not os.path.exists(fname):
        return 0
    # Count lazily with a context manager: edges.txt can be large, so avoid
    # reading the whole file into memory, and never leak the file handle.
    with open(fname) as f:
        return sum(1 for _ in f)


def lines_with_10_fields(s):
    ''' Return the stripped lines of s that contain exactly 10 tab-separated fields.

    Lines with any other field count (including blank lines) are dropped.
    '''
    return [
        stripped
        for stripped in (raw.strip() for raw in s.split('\n'))
        if len(stripped.split('\t')) == 10
    ]


def age_of_file_in_seconds(fname):
    ''' Return the age of fname in seconds, i.e. the time elapsed since its
    last modification (the original docstring wrongly said "days"). '''
    st = os.stat(fname)
    seconds = time.time() - st.st_mtime
    return seconds


def make_edge_dict_from_files(file_lst):
    ''' Build an index of the edges already present in file_lst.

    The key is the concatenation of the first two tab-separated fields of an
    edge line; the value is a list of hashes of the joined remaining eight
    fields, one hash per occurrence.
    '''
    edge_dict = {}
    for path in file_lst:
        with open(path) as fh:
            for edge_line in lines_with_10_fields(fh.read()):
                fields = edge_line.split('\t')
                key = fields[0] + fields[1]
                fingerprint = hash(''.join(fields[2:]))
                edge_dict.setdefault(key, []).append(fingerprint)
    return edge_dict


def new_edge_line(line, edge_dict):
    ''' Return True if a 10-field edge line is not yet recorded in edge_dict.

    edge_dict maps field0+field1 keys to lists of hashes of the joined
    remaining fields (see make_edge_dict_from_files); the line is "new" when
    its key is absent or none of the stored hashes matches.
    '''
    fields = line.split('\t')
    key = fields[0] + fields[1]
    if key not in edge_dict:
        return True
    fingerprint = hash(''.join(fields[2:]))
    return all(stored != fingerprint for stored in edge_dict[key])


def concatenate_edge_files(fname_lst, dir_out, fname_out):
    ''' Concatenate the edge files in fname_lst into dir_out/fname_out.

    Only well-formed lines (exactly 10 tab-separated fields) are written, and
    lines that already exist in the edges.txt.* files under dir_out are
    skipped, so the pool never stores duplicates.
    '''
    # Index every edge already present in the pool directory.
    edge_dict = make_edge_dict_from_files(glob.glob(os.path.join(dir_out, 'edges.txt.*')))
    # 'with' guarantees the output handle is closed even if reading one of the
    # input files raises (the original leaked fout in that case).
    with open(os.path.join(dir_out, fname_out), 'w') as fout:
        for fname in fname_lst:
            with open(fname) as f:
                s = f.read()
            # Make sure each edge has 10 fields before writing.
            lines = lines_with_10_fields(s)
            # Do not write duplicate lines, to save space.
            kept_lines = [line for line in lines if new_edge_line(line, edge_dict)]
            if kept_lines:
                fout.write('\n'.join(kept_lines) + '\n')


def delete_edge_files(fname_lst):
    ''' Delete each file in fname_lst that is at least six hours old.

    Younger files may still be in the middle of being written by another
    process, so they are kept and a note is written to the update log.
    '''
    age_in_hours = 6
    min_age_seconds = age_in_hours * 60 * 60
    for fname in fname_lst:
        # Before we delete a file, we should make sure it is not being updated. Make sure it is old enough. Otherwise, don't delete.
        if age_of_file_in_seconds(fname) <= min_age_seconds:
            write_log_file('[update_network_by_force.py] In function delete_edge_files. Check file %s.  It is probably still being written (age less than %d hours).  So I don\'t delete it.' % (fname, age_in_hours), UPDATE_NETWORK_LOG_FILE)
        else:
            os.remove(fname)


def summarize_edge_file(fname):
    ''' Summarize the association strengths in edge file fname.

    Counts the 10-field lines whose 9th field (association strength) is above
    or below the threshold tau and returns a one-line report with percentages.
    Returns an explanatory message when fname is missing or holds no valid
    edge lines.  (The original docstring was copied from num_line and wrong.)
    '''
    if not os.path.exists(fname):
        return 'File %s does not exist.' % (fname)
    tau = 2.0  # threshold separating "strong" from "weak" edges in the report
    count_below = 0
    count_above = 0
    count_total = 0
    # 'with' ensures the handle is closed even if float() raises on a bad field.
    with open(fname) as f:
        for line in f:
            lst = line.strip().split('\t')
            if len(lst) == 10:  # ignore malformed lines
                association_strength = float(lst[8])
                count_total += 1
                if association_strength > tau:
                    count_above += 1
                else:
                    count_below += 1
    if count_total > 0:
        return '#edges above %4.1f: %d (%4.3f percent), #edges below %4.1f: %d (%4.3f percent).' % (tau, count_above, 100.0*count_above/count_total, tau, count_below, 100.0*count_below/count_total)
    else:
        return 'Total edges is 0.'


########## Merge edges #######################
# Update edges.txt, a merged file from two sources, HISTORY_DIR and HISTORY_DIR2. Some new edge files are being generated there ...
# Definition of HISTORY_DIR and HISTORY_DIR2 could be found in configure.py
# NOTE(review): the purpose of this pause is not stated — presumably it lets
# in-flight writes to the edge files settle before we scan them; confirm.
time.sleep(10)
edge_file_lst = [] # collect edge files (file names).
most_recent_edge_modification_time = 0  # mtime (epoch seconds) of the newest edge file seen

# Gather every edges.txt.* file from both history directories and remember
# the most recent modification time among them.
for history_directory in [HISTORY_DIR, HISTORY_DIR2]:
    write_log_file('[update_network_by_force.py] Look at edge files in %s.' % (history_directory), UPDATE_NETWORK_LOG_FILE)
    for fname in glob.glob(os.path.join(history_directory, 'edges.txt.*')): # many small edges.txt.* are to be merged
        edge_file_lst.append(fname)
        if os.path.getmtime(fname) > most_recent_edge_modification_time:
            most_recent_edge_modification_time = os.path.getmtime(fname)

# Create an empty MERGED_EDGE_FILE if it is missing, so the getmtime calls
# below cannot fail with FileNotFoundError.
if not os.path.exists(MERGED_EDGE_FILE):
    write_log_file('[update_network_by_force.py] WARNING: missing required file %s.' % (MERGED_EDGE_FILE), UPDATE_NETWORK_LOG_FILE)
    with open(MERGED_EDGE_FILE, 'w') as f:
        f.write('')

if edge_file_lst == []:
    write_log_file('[update_network_by_force.py] No edge files to merge in %s and %s.' % (HISTORY_DIR, HISTORY_DIR2), UPDATE_NETWORK_LOG_FILE)
elif os.path.getmtime(MERGED_EDGE_FILE) < most_recent_edge_modification_time: # update edges.txt only if there are newer edges to add.
    # concatenate edge files into one and store in EDGE_POOL_DIR
    write_log_file('[update_network_by_force.py] Concatenate edge files in %s and %s into one file.' % (HISTORY_DIR, HISTORY_DIR2), UPDATE_NETWORK_LOG_FILE)
    curr_time = datetime.now().strftime('%Y%m%d_%H%M')
    concatenate_edge_files(edge_file_lst, EDGE_POOL_DIR, 'edges.txt.many.one.targets.' + curr_time) # this will update EDGE_POOL_DIR
    delete_edge_files(edge_file_lst) # delete these files only when they are no longer being written.

# Rebuild edges.txt whenever the pool directory is newer than the merged file.
# NOTE(review): this compares against the mtime of the DIRECTORY itself, which
# updates when files are added/removed in it — presumably intentional; confirm.
if os.path.getmtime(MERGED_EDGE_FILE) < os.path.getmtime(EDGE_POOL_DIR): # edge pool directory has been updated, create a new edges.txt
    write_log_file('[update_network_by_force.py] Make a new edges.txt from edge files in %s.' % (EDGE_POOL_DIR), UPDATE_NETWORK_LOG_FILE)
    write_log_file('[update_network_by_force.py] Number of lines in the old edges.txt: %d.' % (num_line(MERGED_EDGE_FILE)), UPDATE_NETWORK_LOG_FILE)
    write_log_file('[update_network_by_force.py] %s' % (summarize_edge_file(MERGED_EDGE_FILE)), UPDATE_NETWORK_LOG_FILE)
    Sold = utils.get_edge_set(MERGED_EDGE_FILE) # all old edges stored in a set
    cmd = 'python3 merge_edges.py'  # invoke another script to merge all edge files in EDGE_POOL_DIR
    return_value = os.system(cmd)
    # os.system returns a nonzero status if merge_edges.py failed; log it but
    # carry on so the remaining bookkeeping still happens.
    if return_value != 0:
        write_log_file('[update_network_by_force.py] WARNING: something wrong occurred to merge_edges.py.  Perhaps your computer is running out of memory.', UPDATE_NETWORK_LOG_FILE)
    write_log_file('[update_network_by_force.py] Number of lines in the new edges.txt: %d.' % (num_line(MERGED_EDGE_FILE)), UPDATE_NETWORK_LOG_FILE)
    write_log_file('[update_network_by_force.py] %s' % (summarize_edge_file(MERGED_EDGE_FILE)), UPDATE_NETWORK_LOG_FILE)
    Snew = utils.get_edge_set(MERGED_EDGE_FILE) # all new edges stored in a set. Note that MERGED_EDGE_FILE has been updated by 'python3 merge_edges.py'
    # Record the difference between old and new edge sets in DIFF_EDGE_FILE.
    utils.make_new_edges_file(Sold, Snew, MERGED_EDGE_FILE, DIFF_EDGE_FILE)
    manual_copy_commands = 'MANUAL: Please copy edges.txt to the web application: sudo cp /home/lanhui/brain/Data/temp/edges.txt /var/www/brain/brain/static/edges/edges.txt sudo cp /home/lanhui/brain/Data/temp/html_edges/edges.sqlite /var/www/brain/brain/static/edges curl http://118.25.96.118/brain/before'
    write_log_file('[update_network_by_force.py] %s' % (manual_copy_commands), UPDATE_NETWORK_LOG_FILE)
    write_log_file('[update_network_by_force.py] Make HTML files for the web application.', UPDATE_NETWORK_LOG_FILE)
    cmd = 'python3 html_network.py -f %s -r %s -c %s -n %s' % (MERGED_EDGE_FILE, PARAMETER_FOR_BUILDRMATRIX, PARAMETER_FOR_BUILDCMATRIX, PARAMETER_FOR_NET)
    os.system(cmd)
    # Back up the merged file roughly once a month (only on day 28).
    if datetime.now().day % 28 == 0:
        copy_and_backup_file(MERGED_EDGE_FILE, '../Analysis') # the backup file will be used for further analysis

# Compute overlap between our predicted edges and a gold-standard network.

gold_standard_file = '../Data/temp/AtRegNet.20210208.csv'
if os.path.exists(gold_standard_file) and os.path.exists(MERGED_EDGE_FILE):
    # Load the gold standard: comma-separated, with a 'TFName' header row.
    # Column 1 appears to be the TF gene id and column 4 the target gene id —
    # TODO(review): confirm against the AtRegNet CSV layout.
    AtRegNet_dict = {}
    with open(gold_standard_file) as f:
        for line in f:
            line = line.strip()
            lst = line.split(',')
            if lst[0] != 'TFName' and len(lst) > 4:
                tf = lst[1].upper().strip()
                target = lst[4].upper().strip()
                AtRegNet_dict[tf+target] = 100  # placeholder score; only key membership matters

    # Load our merged edges, keyed the same way (TF id + target id -> score).
    # NOTE(review): here column 0 is treated as the target and column 1 as the
    # TF, each possibly followed by extra tokens — confirm edge file layout.
    BrainEdges_dict = {}
    with open(MERGED_EDGE_FILE) as f:
        for line in f:
            line = line.strip()
            lst = line.split('\t')
            tf = lst[1].split()[0]
            target = lst[0].split()[0]
            score = float(lst[8])
            BrainEdges_dict[tf+target] = score

    # Overlap thresholds: predictions with score > 3 count as positives;
    # every gold-standard entry (score > 0) counts as a true edge.
    overlap = Overlap(BrainEdges_dict, 3, AtRegNet_dict, 0)
    # The +1 in the denominator guards against division by zero when there are no predicted positives.
    write_log_file('[update_network_by_force.py] Performance stats - TP:%d, PP:%d, Hit rate: %4.7f while comparing with AtRegNet.20210208.csv.' % (overlap.getTP(), overlap.getNumberOfPositivesInPred(), overlap.getTP()/(overlap.getNumberOfPositivesInPred()+1)), UPDATE_NETWORK_LOG_FILE)

write_log_file('[update_network_by_force.py] Update done at %s.\n\n' % (datetime.now().strftime('%Y-%m-%d %H:%M:%S')), UPDATE_NETWORK_LOG_FILE)