diff options
author | Hui Lan <lanhui@zjnu.edu.cn> | 2020-01-21 11:52:09 +0800 |
---|---|---|
committer | Hui Lan <lanhui@zjnu.edu.cn> | 2020-01-21 11:52:09 +0800 |
commit | 6701b310b0f2694b681517381ad889ec67012fe0 (patch) | |
tree | fea6d914a490fafd5d67aff5d5498813d6849cfb | |
parent | 5a70b3b498e64dc903b017d45be09a808cfb2b89 (diff) |
merge_edges.py: write all edge information to an SQLite database file called edges.sqlite
When I saved a static html page for each edge (e.g., http://118.25.96.118/static/edges/AT1G20910_AT1G30100_0.html), it took
5GB disk space for saving 1 million html pages. Not very disk space efficient.
An alternative is to save all edge information in a database table (i.e., edge), and query this database table for a
particular edge.
The database file edges.sqlite takes less than 200MB for 1 million edges, requiring 10 times smaller space than the static
approach. The reason is that we do not have a lot of HTML tags in the database. Quite happy about that, though it seems
that filling a database is a bit slower (2 hours??? for 1 million rows).
Also updated two files that were affected: update_network.py and update_network_by_force.py. Now instead of copying 1 million
static html page to the Webapp, I just need to copy edges.sqlite to static/edges/. Faster.
In the Webapp, I updated start_webapp.py and added a file templates/edge.html for handling dynamic page generation.
-Hui
-rw-r--r-- | Code/merge_edges.py | 46 | ||||
-rwxr-xr-x | Code/update_network.py | 2 | ||||
-rw-r--r-- | Code/update_network_by_force.py | 2 |
3 files changed, 40 insertions, 10 deletions
diff --git a/Code/merge_edges.py b/Code/merge_edges.py index a8bd3b9..5ea1485 100644 --- a/Code/merge_edges.py +++ b/Code/merge_edges.py @@ -22,6 +22,7 @@ import os, operator, sys, math, datetime, glob from configure import EDGE_POOL_DIR, MERGED_EDGE_FILE +import sqlite3 def get_number_of_RNAseq_ids(s): if s == '.': @@ -61,14 +62,14 @@ def make_html_page(lst, fname): head += '<link href="./c3.min.css" rel="stylesheet" />\n<script src="./d3.min.js"></script>\n<script src="./c3.min.js"></script>\n<script src="./scatterplot.js"></script>' s = '<html>' s += '<head>%s</head>\n' % (head) - body = '<p>TF is %s. </p>\n' % (split_id_and_name(lst[1])) - body += '<p>Target is %s. </p>\n' % (split_id_and_name(lst[0])) - body += '<p>Association strength: %s.</p>\n' % (lst[8]) + body = '<p>TF: %s</p>\n' % (split_id_and_name(lst[1])) + body += '<p>Target: %s</p>\n' % (split_id_and_name(lst[0])) + body += '<p>Association strength: %s</p>\n' % (lst[8]) body += '<p>Edge made on %s. </p>\n' % (add_dashes_to_date(lst[7])) body += '<p>Methods: %s</p>\n' % (make_html_list(lst[9])) body += '<p>Evidence of binding: %s.</p>\n' % (lst[5] if lst[5] != '.' else 'TBA') - body += '<a id="myLink" href="javascript:void(0);" onclick="drawScatterPlot(\'json/%s.json\', \'json/%s.json\', \'rnaseq_info_database.json\', [\'.\']);">Click for gene expression scatter plot</a>\n' % (tf, target) - body += '<p>For more detailed analysis, <a href="gene-expression-level-scatterplot-by-XuMengqi.zip">download</a> our gene expression scatter plotting tool. No installation is required. Input data: <a href="json/%s.json">TF gene expression</a> <a href="json/%s.json">Target gene expression</a> <a href="rnaseq_info_database.json">RNA-seq annotation</a></p>\n' % (tf, target) + body += '<a id="myLink" href="javascript:void(0);" onclick="drawScatterPlot(\'json/%s.json\', \'json/%s.json\', \'rnaseq_info_database.json\', [\'.\']);">Gene expression scatter plot</a>\n' % (tf, target) + body += '<p>For more detailed analysis, <a href="gene-expression-level-scatterplot-by-XuMengqi.zip">download</a> our gene expression scatter plotting tool. No installation is required. Input: <a href="json/%s.json">TF gene expression</a> <a href="json/%s.json">Target gene expression</a> <a href="rnaseq_info_database.json">RNA-seq annotation</a></p>\n' % (tf, target) body += '<p id="chart"></p>\n' s += '<body>%s</body>\n' % (body) s += '</html>' @@ -77,6 +78,21 @@ def make_html_page(lst, fname): f.close() +def fill_database(lst, conn): + ''' Store all edge information in a SQLite database, which can be retrieved in the Webapp.''' + tf = lst[1].split()[0] # ID only, no name + tf_name = split_id_and_name(lst[1]) + target = lst[0].split()[0] + target_name = split_id_and_name(lst[0]) + strength = lst[8] + edge_date = add_dashes_to_date(lst[7]) + method = make_html_list(lst[9]) + evidence = lst[5] if lst[5] != '.' else 'TBA' + conn.execute('CREATE TABLE IF NOT EXISTS edge (target_id text, target_name text, tf_id text, tf_name text, strength text, date text, method text, evidence text)') + conn.execute('INSERT INTO edge (target_id, target_name, tf_id, tf_name, strength, date, method, evidence) VALUES (?,?,?,?,?,?,?,?)', (target, target_name, tf, tf_name, strength, edge_date, method, evidence)) + conn.commit() + + def compute_time_difference_in_days(t1, t2): ''' t1 and t2 has this format: yyyymmdd. ''' if not t1.isnumeric() and length(t1) != 8: @@ -170,8 +186,22 @@ for k in d: fout.close() -print('[merge_edges.py]: Make html edge files. May take a while...') +print('[merge_edges.py]: Make html edge files. May take a while ...') + +db_fname = folder_path + '/' + 'edges.sqlite' +if os.path.exists(db_fname): + os.remove(db_fname) + +conn = sqlite3.connect(db_fname) for k in d: lst = make_new_edge(d[k]) - pagename = lst[1].split()[0] + '_' + lst[0].split()[0] + '_0.html' # TF_Target.html - make_html_page(lst, folder_path + '/' + pagename) + # Make an html page for each edge (taking Big disk space). This will take about 5GB disk space + # for 1.3 million edges, not very disk space friendly. So I use a database-driven dynamic method + # to save space. + # pagename = lst[1].split()[0] + '_' + lst[0].split()[0] + '_0.html' # TF_Target.html + # make_html_page(lst, folder_path + '/' + pagename) + # Write to a SQLite database file called edges.sqlite, which will be used for the Webapp. + # edges.sqlite will be put under static/edges/ for querying. + fill_database(lst, conn) + +conn.close() diff --git a/Code/update_network.py b/Code/update_network.py index f65b3ba..672dc1e 100755 --- a/Code/update_network.py +++ b/Code/update_network.py @@ -865,7 +865,7 @@ if os.path.getmtime(MERGED_EDGE_FILE) < os.path.getmtime(EDGE_POOL_DIR): # edge cmd = 'python3 merge_edges.py' os.system(cmd) write_log_file('[update_network.py] Number of lines in the new edges.txt: %d.' % (num_line(MERGED_EDGE_FILE)), UPDATE_NETWORK_LOG_FILE) - manual_copy_commands = 'Please copy files to the web application: sudo cp /home/lanhui/brain/Data/temp/edges.txt /var/www/brain/brain/static/edges/edges.txt sudo find /home/lanhui/brain/Data/temp/html_edges -name "*.html" -exec mv -t /var/www/brain/brain/static/edges {} +' + manual_copy_commands = 'Please copy files to the web application: sudo cp /home/lanhui/brain/Data/temp/edges.txt /var/www/brain/brain/static/edges/edges.txt sudo cp /home/lanhui/brain/Data/temp/html_edges/edges.sqlite /var/www/brain/brain/static/edges' write_log_file('[update_network.py] %s' % (manual_copy_commands), UPDATE_NETWORK_LOG_FILE) diff --git a/Code/update_network_by_force.py b/Code/update_network_by_force.py index 0e56d95..eb33b82 100644 --- a/Code/update_network_by_force.py +++ b/Code/update_network_by_force.py @@ -135,7 +135,7 @@ if os.path.getmtime(MERGED_EDGE_FILE) < os.path.getmtime(EDGE_POOL_DIR): # edge os.system(cmd) write_log_file('[update_network_by_force.py] Number of lines in the new edges.txt: %d.' % (num_line(MERGED_EDGE_FILE)), UPDATE_NETWORK_LOG_FILE) write_log_file('[update_network_by_force.py] %s' % (summarize_edge_file(MERGED_EDGE_FILE)), UPDATE_NETWORK_LOG_FILE) - manual_copy_commands = 'Please copy files to the web application: sudo cp /home/lanhui/brain/Data/temp/edges.txt /var/www/brain/brain/static/edges/edges.txt sudo find /home/lanhui/brain/Data/temp/html_edges -name "*.html" -exec mv -t /var/www/brain/brain/static/edges {} +' + manual_copy_commands = 'Please copy files to the web application: sudo cp /home/lanhui/brain/Data/temp/edges.txt /var/www/brain/brain/static/edges/edges.txt sudo cp /home/lanhui/brain/Data/temp/html_edges/edges.sqlite /var/www/brain/brain/static/edges' write_log_file('[update_network_by_force.py] %s' % (manual_copy_commands), UPDATE_NETWORK_LOG_FILE) copy_and_backup_file(MERGED_EDGE_FILE, '../Analysis') # the backup file will be used for further analysis |