summaryrefslogtreecommitdiff
path: root/Code
diff options
context:
space:
mode:
authorHui Lan <lanhui@zjnu.edu.cn>2020-01-21 11:52:09 +0800
committerHui Lan <lanhui@zjnu.edu.cn>2020-01-21 11:52:09 +0800
commit6701b310b0f2694b681517381ad889ec67012fe0 (patch)
treefea6d914a490fafd5d67aff5d5498813d6849cfb /Code
parent5a70b3b498e64dc903b017d45be09a808cfb2b89 (diff)
merge_edges.py: write all edge information to an SQLite database file called edges.sqlite
When I saved a static html page for each edge (e.g., http://118.25.96.118/static/edges/AT1G20910_AT1G30100_0.html), it took 5GB disk space for saving 1 million html pages. Not very disk space efficient. An alternative is to save all edge information in a database table (i.e., edge), and query this database table for a particular edge. The database file edges.sqlite takes less than 200MB for 1 million edges, requiring 10 times smaller space than the static approach. The reason is that we do not have a lot of HTML tags in the database. Quite happy about that, though it seems that filling a database is a bit slower (2 hours??? for 1 million rows). Also updated two files that were affected: update_network.py and update_network_by_force.py. Now instead of copying 1 million static html page to the Webapp, I just need to copy edges.sqlite to static/edges/. Faster. In the Webapp, I updated start_webapp.py and added a file templates/edge.html for handling dynamic page generation. -Hui
Diffstat (limited to 'Code')
-rw-r--r--Code/merge_edges.py46
-rwxr-xr-xCode/update_network.py2
-rw-r--r--Code/update_network_by_force.py2
3 files changed, 40 insertions, 10 deletions
diff --git a/Code/merge_edges.py b/Code/merge_edges.py
index a8bd3b9..5ea1485 100644
--- a/Code/merge_edges.py
+++ b/Code/merge_edges.py
@@ -22,6 +22,7 @@
import os, operator, sys, math, datetime, glob
from configure import EDGE_POOL_DIR, MERGED_EDGE_FILE
+import sqlite3
def get_number_of_RNAseq_ids(s):
if s == '.':
@@ -61,14 +62,14 @@ def make_html_page(lst, fname):
head += '<link href="./c3.min.css" rel="stylesheet" />\n<script src="./d3.min.js"></script>\n<script src="./c3.min.js"></script>\n<script src="./scatterplot.js"></script>'
s = '<html>'
s += '<head>%s</head>\n' % (head)
- body = '<p>TF is %s. </p>\n' % (split_id_and_name(lst[1]))
- body += '<p>Target is %s. </p>\n' % (split_id_and_name(lst[0]))
- body += '<p>Association strength: %s.</p>\n' % (lst[8])
+ body = '<p>TF: %s</p>\n' % (split_id_and_name(lst[1]))
+ body += '<p>Target: %s</p>\n' % (split_id_and_name(lst[0]))
+ body += '<p>Association strength: %s</p>\n' % (lst[8])
body += '<p>Edge made on %s. </p>\n' % (add_dashes_to_date(lst[7]))
body += '<p>Methods: %s</p>\n' % (make_html_list(lst[9]))
body += '<p>Evidence of binding: %s.</p>\n' % (lst[5] if lst[5] != '.' else 'TBA')
- body += '<a id="myLink" href="javascript:void(0);" onclick="drawScatterPlot(\'json/%s.json\', \'json/%s.json\', \'rnaseq_info_database.json\', [\'.\']);">Click for gene expression scatter plot</a>\n' % (tf, target)
- body += '<p>For more detailed analysis, <a href="gene-expression-level-scatterplot-by-XuMengqi.zip">download</a> our gene expression scatter plotting tool. No installation is required. Input data: <a href="json/%s.json">TF gene expression</a> <a href="json/%s.json">Target gene expression</a> <a href="rnaseq_info_database.json">RNA-seq annotation</a></p>\n' % (tf, target)
+ body += '<a id="myLink" href="javascript:void(0);" onclick="drawScatterPlot(\'json/%s.json\', \'json/%s.json\', \'rnaseq_info_database.json\', [\'.\']);">Gene expression scatter plot</a>\n' % (tf, target)
+ body += '<p>For more detailed analysis, <a href="gene-expression-level-scatterplot-by-XuMengqi.zip">download</a> our gene expression scatter plotting tool. No installation is required. Input: <a href="json/%s.json">TF gene expression</a> <a href="json/%s.json">Target gene expression</a> <a href="rnaseq_info_database.json">RNA-seq annotation</a></p>\n' % (tf, target)
body += '<p id="chart"></p>\n'
s += '<body>%s</body>\n' % (body)
s += '</html>'
@@ -77,6 +78,21 @@ def make_html_page(lst, fname):
f.close()
+def fill_database(lst, conn):
+ ''' Store all edge information in a SQLite database, which can be retrieved in the Webapp.'''
+ tf = lst[1].split()[0] # ID only, no name
+ tf_name = split_id_and_name(lst[1])
+ target = lst[0].split()[0]
+ target_name = split_id_and_name(lst[0])
+ strength = lst[8]
+ edge_date = add_dashes_to_date(lst[7])
+ method = make_html_list(lst[9])
+ evidence = lst[5] if lst[5] != '.' else 'TBA'
+ conn.execute('CREATE TABLE IF NOT EXISTS edge (target_id text, target_name text, tf_id text, tf_name text, strength text, date text, method text, evidence text)')
+ conn.execute('INSERT INTO edge (target_id, target_name, tf_id, tf_name, strength, date, method, evidence) VALUES (?,?,?,?,?,?,?,?)', (target, target_name, tf, tf_name, strength, edge_date, method, evidence))
+ conn.commit()
+
+
def compute_time_difference_in_days(t1, t2):
''' t1 and t2 has this format: yyyymmdd. '''
if not t1.isnumeric() and length(t1) != 8:
@@ -170,8 +186,22 @@ for k in d:
fout.close()
-print('[merge_edges.py]: Make html edge files. May take a while...')
+print('[merge_edges.py]: Make html edge files. May take a while ...')
+
+db_fname = folder_path + '/' + 'edges.sqlite'
+if os.path.exists(db_fname):
+ os.remove(db_fname)
+
+conn = sqlite3.connect(db_fname)
for k in d:
lst = make_new_edge(d[k])
- pagename = lst[1].split()[0] + '_' + lst[0].split()[0] + '_0.html' # TF_Target.html
- make_html_page(lst, folder_path + '/' + pagename)
+ # Make an html page for each edge (taking Big disk space). This will take about 5GB disk space
+ # for 1.3 million edges, not very disk space friendly. So I use a database-driven dynamic method
+ # to save space.
+ # pagename = lst[1].split()[0] + '_' + lst[0].split()[0] + '_0.html' # TF_Target.html
+ # make_html_page(lst, folder_path + '/' + pagename)
+ # Write to a SQLite database file called edges.sqlite, which will be used for the Webapp.
+ # edges.sqlite will be put under static/edges/ for querying.
+ fill_database(lst, conn)
+
+conn.close()
diff --git a/Code/update_network.py b/Code/update_network.py
index f65b3ba..672dc1e 100755
--- a/Code/update_network.py
+++ b/Code/update_network.py
@@ -865,7 +865,7 @@ if os.path.getmtime(MERGED_EDGE_FILE) < os.path.getmtime(EDGE_POOL_DIR): # edge
cmd = 'python3 merge_edges.py'
os.system(cmd)
write_log_file('[update_network.py] Number of lines in the new edges.txt: %d.' % (num_line(MERGED_EDGE_FILE)), UPDATE_NETWORK_LOG_FILE)
- manual_copy_commands = 'Please copy files to the web application: sudo cp /home/lanhui/brain/Data/temp/edges.txt /var/www/brain/brain/static/edges/edges.txt sudo find /home/lanhui/brain/Data/temp/html_edges -name "*.html" -exec mv -t /var/www/brain/brain/static/edges {} +'
+ manual_copy_commands = 'Please copy files to the web application: sudo cp /home/lanhui/brain/Data/temp/edges.txt /var/www/brain/brain/static/edges/edges.txt sudo cp /home/lanhui/brain/Data/temp/html_edges/edges.sqlite /var/www/brain/brain/static/edges'
write_log_file('[update_network.py] %s' % (manual_copy_commands), UPDATE_NETWORK_LOG_FILE)
diff --git a/Code/update_network_by_force.py b/Code/update_network_by_force.py
index 0e56d95..eb33b82 100644
--- a/Code/update_network_by_force.py
+++ b/Code/update_network_by_force.py
@@ -135,7 +135,7 @@ if os.path.getmtime(MERGED_EDGE_FILE) < os.path.getmtime(EDGE_POOL_DIR): # edge
os.system(cmd)
write_log_file('[update_network_by_force.py] Number of lines in the new edges.txt: %d.' % (num_line(MERGED_EDGE_FILE)), UPDATE_NETWORK_LOG_FILE)
write_log_file('[update_network_by_force.py] %s' % (summarize_edge_file(MERGED_EDGE_FILE)), UPDATE_NETWORK_LOG_FILE)
- manual_copy_commands = 'Please copy files to the web application: sudo cp /home/lanhui/brain/Data/temp/edges.txt /var/www/brain/brain/static/edges/edges.txt sudo find /home/lanhui/brain/Data/temp/html_edges -name "*.html" -exec mv -t /var/www/brain/brain/static/edges {} +'
+ manual_copy_commands = 'Please copy files to the web application: sudo cp /home/lanhui/brain/Data/temp/edges.txt /var/www/brain/brain/static/edges/edges.txt sudo cp /home/lanhui/brain/Data/temp/html_edges/edges.sqlite /var/www/brain/brain/static/edges'
write_log_file('[update_network_by_force.py] %s' % (manual_copy_commands), UPDATE_NETWORK_LOG_FILE)
copy_and_backup_file(MERGED_EDGE_FILE, '../Analysis') # the backup file will be used for further analysis