From 0cd86c11a40086548dfc438977c674bb2eebca0e Mon Sep 17 00:00:00 2001
From: Hui Lan
Date: Fri, 14 Aug 2020 23:24:00 +0800
Subject: html_network.py: make URL for edge information such as
 AT3G09600_AT1G49720_0.html correct.

---
 Code/start_webapp.py | 562 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 562 insertions(+)
 create mode 100755 Code/start_webapp.py

(limited to 'Code/start_webapp.py')

diff --git a/Code/start_webapp.py b/Code/start_webapp.py
new file mode 100755
index 0000000..8891384
--- /dev/null
+++ b/Code/start_webapp.py
@@ -0,0 +1,562 @@
# Brain's web app
# Created in December 2016, slcu, hui
#
# Last modified 29 June 2017 [made function get_successors, updated function get_predecessors]
# Last modified 3 July 2017 [added method option, so users can input 'AT1G65480 leaf' to get neighbours based on RNA-seq samples from leaf tissue. See get_method()]
# Last modified 25 Jan 2018 [Note that if the 8th field (lst[8] in build_network_from_file()) is all 0.0, edges will be missing. Search this file for 'workaround' to 'fix' it. The 8th field represents the confidence of the edge.]
# Last modified 1 Aug 2019. Can I share global variables G and SOURCE_NODES between requests? Failed to do that. The current solution is to save two pickle files under the static folder.

import networkx as nx
from networkx.readwrite import json_graph
import json
import util_networkx
import os
import re
import random
import numpy as np
import operator
import math
from datetime import datetime
import time
import pickle # needed by load_pickle_data() below; leaving this import commented out would break that function
import _pickle as cPickle
######################################
BRAIN_APP_PATH = '/var/www/brain/brain/' # location of the application folder
EDGE_FILE = '../Data/temp/edges.txt' # [change]
EDGE_DB = BRAIN_APP_PATH + 'static/edges/edges.sqlite'
TF_CATEGORY_FILE = '../Data/information/2D_table2.xls' # put TFs into four categories: TI, TN, GI, GN.
SHOW_N_NODE_EACH_TIME = 50
TOP_N_NEIGHBOURS = 30
STRENGTH_TAU = 0.1 # don't show edges with low strength
ALPHA = 0.6
######################################


def scale_factor(n, N, LL, alpha):
    '''
    n     -- number of RNA-seq experiments
    N     -- maximum number of RNA-seq experiments
    LL    -- total log likelihood
    alpha -- weight for the number of RNA-seq experiments
    '''
    #print('DEBUG n=%d N=%d LL=%g' % (n,N,LL))
    return alpha * (1.0*n/N) + (1-alpha)*(np.e**(1.0*min(LL,0)/n))**0.1

def get_method(s):
    ''' Return the first recognised method/tissue keyword found in s, or the empty string if none is found. '''
    s = s.lower()
    recognised_methods = ['root', 'bud', 'meristem', 'aerial', 'leaf', 'stem', 'seedling', 'seed', 'shoot', 'flower', 'mixreg', 'all', 'hclust.group', 'hclust.fixed.group', 'wedge', 'mixtool']
    for x in recognised_methods:
        if x in s:
            return x
    return ''

def contain_whole_word(s, t):
    ''' Return True if and only if the whole word s is in t. '''
    lst = [x.strip() for x in t.split(',')] # t is something like 'shoot, all, seedling, hclust, seed, root'; strip spaces so ' all' matches 'all'
    lst.append('') # handle the case when s is '', so that an empty method matches everything
    return s in lst
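
# Illustrative behaviour of the two helpers above (hypothetical values, for
# documentation only; these calls are not part of the original app):
#   get_method('AT1G65480 leaf')                     -> 'leaf'
#   contain_whole_word('leaf', 'shoot, leaf, root')  -> True
#   contain_whole_word('lea', 'shoot, leaf, root')   -> False (whole words only)
#   contain_whole_word('', 'shoot, leaf, root')      -> True (empty method matches everything)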
def get_successors(G, gene, method):
    ''' Only select successors whose best edge from gene is strong enough. '''
    node = gene
    successors = []
    for n in G.successors(node):
        d = G.get_edge_data(node, n)
        max_strength = 0
        for k in d.keys(): # select the best among parallel edges
            if d[k]['strength'] > max_strength and contain_whole_word(method, d[k]['method']):
                max_strength = d[k]['strength']
        if max_strength > STRENGTH_TAU:
            successors.append(n)
    return successors

def get_predecessors(G, gene, num, method):
    ''' Get the top num predecessors of gene. '''

    node = gene
    predecessors = []
    # don't include edges with small strength
    for n in G.predecessors(node):
        d = G.get_edge_data(n, node)
        max_strength = 0
        for k in d.keys(): # select the best among parallel edges
            #print('DEBUG: %s %4.2f %4.2f' % (d[k]['link'], d[k]['weight'], d[k]['strength'] ) )
            if d[k]['strength'] > max_strength and contain_whole_word(method, d[k]['method']):
                max_strength = d[k]['strength']
        if max_strength > STRENGTH_TAU:
            predecessors.append(n)

    #print('DEBUG: len(predecessors)=%d num=%d' % (len(predecessors), num))
    if len(predecessors) < num:
        return predecessors

    d1 = {}
    for n in predecessors:
        d = G.get_edge_data(n, node)
        max_strength = 0
        for k in d.keys(): # select the best among parallel edges
            if d[k]['strength'] > max_strength:
                max_strength = d[k]['strength']
        if n == 'AT3G50410' or n == 'AT1G52150':
            print('WARNING: %s should not be here. Method:%s.' % (n, method))
        d1[n] = max_strength

    sorted_lst = sorted(d1.items(), key=operator.itemgetter(1), reverse=True)
    a, b = zip(*sorted_lst)
    return np.random.choice(a, num, replace=False, p=1.0 * np.array(b)/sum(b)) # sample num predecessors, weighted by edge strength


def get_max_rsubset_size(fname):
    ''' Get the maximum number of RNA-seq experiments supporting any edge. '''
    max_rsize = 0
    f = open(fname)
    for line in f:
        line = line.strip()
        lst = line.split('\t')
        condR_lst = []
        if line != '' and len(lst) > 3:
            if len(lst) > 6:
                condR_lst = lst[4].split()
            size_condR = len(condR_lst)
            if size_condR > max_rsize:
                max_rsize = size_condR
    f.close()
    return max_rsize
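
# Column layout of the tab-separated edge file, inferred from the indexing in
# get_max_rsubset_size() above and build_network_from_file() below (an
# assumption reverse-engineered from the code, not a documented spec):
#   lst[0]  target gene, as 'GENE_ID gene_name' (gene_name may be '.')
#   lst[1]  source gene (the TF), in the same format
#   lst[2]  association score, used as the edge weight
#   lst[4]  space-separated IDs of the RNA-seq samples supporting the edge
#   lst[8]  edge confidence (may decay to 0.0; see the workaround note below)
#   lst[9]  comma-separated method/tissue labels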
def build_network_from_file(edge_fname, tf_category_fname):

    EDGE_DIR = 'static/edges'
    tf_category_dict = get_tf_category_dict(tf_category_fname)

    ## build the network
    N = get_max_rsubset_size(edge_fname)
    G = nx.MultiDiGraph(max_rsubset_size=N)

    source_nodes = []
    d = {} # counts parallel edges between the same pair of genes
    f = open(edge_fname)
    for line in f:
        line = line.strip()
        lst = line.split('\t')
        if line != '' and random.randint(1,10) < 10: # randomly keep about 90% of the edges
            strength = max(abs(float(lst[2])), float(lst[8])) # prevent strength from being zero
            #strength = abs(float(lst[2])) # a workaround since lst[8] becomes zero as time elapses; check compute_metric() in update_network.py for details
            method_or_tissue = lst[9]
            g1 = lst[0].split()[0] # target gene ID
            g1_label = lst[0].split()[1].split(';')[0] if lst[0].split()[1] != '.' else g1
            g1_name = lst[0].split()[1] if lst[0].split()[1] != '.' else ''
            g2 = lst[1].split()[0] # source gene ID
            g2_label = lst[1].split()[1].split(';')[0] if lst[1].split()[1] != '.' else g2
            g2_name = lst[1].split()[1] if lst[1].split()[1] != '.' else ''
            g1_link = '/' + EDGE_DIR.rstrip('/') + '/' + g1 + '.html'
            g2_link = '/' + EDGE_DIR.rstrip('/') + '/' + g2 + '.html'
            g1_category = tf_category_dict[g1] if g1 in tf_category_dict else [-9.0, 'N'] # -9.0 means not tissue-specific, and N means not inducible
            g1_TG_score = g1_category[0]
            g1_TG_category = TorGorN(g1_TG_score, 2.40) # T/G; 2.40 is selected to satisfy 'GI > TI and TN > GN'
            g1_IN_category = g1_category[1] # I/N
            g2_category = tf_category_dict[g2] if g2 in tf_category_dict else [-9.0, 'N']
            g2_TG_score = g2_category[0]
            g2_TG_category = TorGorN(g2_TG_score, 2.40) # 'T' if g2_TG_score >= 2.40 else 'G'
            g2_IN_category = g2_category[1]
            G.add_node(g1, full_name=g1_name, label=g1_label, link=g1_link, istf='0', tf_category=g1_TG_category+g1_IN_category, tf_TG_score=g1_TG_score, tf_IN_category=g1_IN_category) # if g1 is also a TF, istf is set to '1' in the loop over source_nodes below
            G.add_node(g2, full_name=g2_name, label=g2_label, link=g2_link, istf='1', tf_category=g2_TG_category+g2_IN_category, tf_TG_score=g2_TG_score, tf_IN_category=g2_IN_category) # tf_category contains the default TF category code; it can be modified later given the user's input

            if not g2+g1 in d:
                d[g2+g1] = 0
            else:
                d[g2+g1] += 1

            link_html_name = '/brain/edges/' + '%s_%s_%d.html' % (g2, g1, d[g2+g1])

            G.add_edge(g2, g1, weight=float(lst[2]), link=link_html_name, strength=strength, method=method_or_tissue) # g2 is the source, and g1 is the target

            source_nodes.append(g2)

    f.close()

    source_nodes = list(set(source_nodes))
    for n in source_nodes:
        G.node[n]['istf'] = '1' # force source nodes to be TFs (G.node is the networkx 1.x API; networkx >= 2.4 renamed it to G.nodes)

    return G, source_nodes


def is_valid_query_gene(gid, G):
    if not re.match(r'AT\dG\d+', gid):
        return 'Invalid gene ID. Example of a valid gene ID: AT1G80870.'
    if not gid in G.nodes():
        return 'Not in graph.'
    return ''

def allowed_file(filename):
    # ALLOWED_EXTENSIONS is not defined in this file; it is presumably supplied by the surrounding web app.
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
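
# The uploaded ChIP peak file checked below is expected to look like BED6+4
# (e.g. ENCODE narrowPeak): at least ten tab-separated columns. A made-up
# example line, for illustration only:
#   1   5630   6050   peak_1   87   .   7.5   12.1   10.3   210
# i.e. chromosome, start, end, name, score (binding strength), strand, plus
# four extra columns; peak_to_target_genes() further down reads column 7 of
# the echoed peak line as the strength written to the target list.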
def validate_chip_peak_file(fname, protein_id, protein_name, status, description):
    protein_id = protein_id.strip().upper()
    if protein_id == 'E.G., AT0G12345' or not protein_id.startswith('AT') or protein_id.find('G') != 3 or len(protein_id) != 9:
        return 'Invalid protein ID: %s.' % (protein_id)
    protein_name = protein_name.strip()
    description = description.strip()
    status = status.strip().upper()
    if status == 'UNSPECIFIED':
        return 'Please select the ChIP-seq experiment status.'

    f = open(fname)
    lines = f.readlines()
    for line in lines:
        line = line.strip()
        lst = line.split('\t')
        if len(lst) < 10: # check that we have at least 10 columns
            return 'The line %s should have at least 10 columns (see BED6+4 format).' % line

        if not lst[0].isdigit():
            c = lst[0].lower()
            if (c.startswith('chr') and c[3:].isdigit()) or (c in ['mt', 'pt', 'ct']):
                pass
            else:
                return 'First column must be a chromosome number. [line: %s].' % line

        if not lst[1].isdigit():
            return 'Second column (starting position) must be a number. [line: %s].' % line
        if not lst[2].isdigit():
            return 'Third column (end position) must be a number. [line: %s].' % line
        if not lst[4].isdigit():
            return 'Fifth column (binding strength) must be a number. [line: %s].' % line

    f.close()
    return ''

def edit_chip_peak_file(fname, protein_id, protein_name, status, description):
    protein_id = protein_id.strip().upper()
    protein_name = protein_name.strip()
    description = description.strip()
    status = status.strip().upper()

    f = open(fname)
    lines = f.readlines()
    f.close()

    f = open(fname, 'w')
    f.write('# PROTEIN_ID:%s\n' % protein_id)
    f.write('# PROTEIN_NAME:%s\n' % protein_name.upper())
    f.write('# STATUS:%s\n' % status.upper())
    f.write('# DESCRIPTION:%s\n' % ' '.join([x.strip() for x in description.split('\n')]))
    f.write(''.join(lines))
    f.close()

# Taken from https://stackoverflow.com/questions/354038/how-do-i-check-if-a-string-is-a-number-float
def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False


def validate_and_edit_rnaseq_file(fname, genes):
    protein_id = genes.strip().upper()
    protein_id_lst = protein_id.split()
    for x in protein_id_lst:
        if not x.startswith('AT') or x.find('G') != 3 or len(x) != 9:
            return 'Invalid gene ID in %s: %s.' % (fname, x)

    fn = os.path.basename(fname)[16:] # drop the date_time prefix
    f = open(fname)
    lines = f.readlines()
    head_line = lines[0].strip()
    head_line_lst = head_line.split()
    if len(head_line_lst) > 2:
        return 'Invalid head line in %s: %s. The head line must contain one or two sample IDs.' % (fn, head_line)
    lineno = 1
    for line in lines[1:]:
        line = line.strip()
        lst = line.split('\t')
        x = lst[0] # gene ID
        if not x.startswith('AT') or x.find('G') != 3 or len(x) != 9:
            return 'Invalid gene ID in %s: %s at line %d.' % (fn, x, lineno)
        for x in lst[1:]:
            if not is_number(x):
                return 'Invalid number %s in %s.' % (x, fn)
        lineno += 1
    f.close()
    return ''

def write_gene_list(genes_str, directory, fname):
    lst = genes_str.strip().split()
    keep_lst = []
    for x in lst:
        g = x.strip()
        if g != '' and g != 'AT0G12345':
            keep_lst.append(g)

    f = open(os.path.join(directory, fname), 'w')
    if len(keep_lst) > 0:
        f.write('\n'.join(keep_lst))
    else:
        f.write('AT0G12345\n') # ghost gene
    f.close()

def write_log_file(fname, s):
    # in the future, if we see concurrency problems, use the Python logging module.
    f = open(fname, 'a')
    curr_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    f.write('[%s] %s\n' % (curr_time, s))
    f.close()

def number_of_columns(fname):
    ''' Return the number of transcriptome columns in fname. '''
    f = open(fname)
    lines = f.readlines()
    f.close()
    if len(lines) < 2:
        return 0
    second_line = lines[1]
    lst = second_line.split('\t')
    return len(lst) - 1 # number of transcriptome columns

def map_to_transcriptome_space(fname):

    nc = number_of_columns(fname)
    if nc == 1:
        cmd = 'Rscript map_to_transcriptome_space.R %s' % (fname)
        os.system(cmd)
    elif nc == 2:
        cmd = 'Rscript map_to_transcriptome_space2.R %s' % (fname)
        os.system(cmd)
    else:
        return {} # an empty dictionary

    head = os.path.basename(fname)[:16]
    static_temp = './static/temp/'
    if not os.path.isdir(static_temp): # create the directory if it does not exist
        os.makedirs(static_temp)

    f = open(static_temp + head + 'result.txt')
    s = f.read()
    f.close()

    if nc == 1:
        return {'result': s, 'heatmap': head + 'heatmap.png', 'edges': head + 'edges.txt'}
    if nc == 2:
        return {'head': head, 'result': s, 'heatmap': head + 'heatmap.png', 'edges': head + 'edges.txt', 'diff_expr_genes': head + 'diff_expr_genes.txt'}
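
# Expected shape of the uploaded RNA-seq file, as implied by
# validate_and_edit_rnaseq_file() and number_of_columns() above (the sample
# IDs and values below are invented, for illustration only):
#   sample1<TAB>sample2           <- head line: one or two sample IDs
#   AT1G80870<TAB>3.2<TAB>4.1     <- gene ID, then one number per sample
#   AT1G65480<TAB>0.0<TAB>1.7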
def peak_to_target_genes(peak_file, protein_id):
    ''' Convert a BED file of peaks to a target gene list.
        Also make two files, date_time_targets.txt and date_time_tfs.txt. '''

    static_temp = './static/temp'
    if not os.path.isdir(static_temp): # create the directory if it does not exist
        os.makedirs(static_temp)

    static_info = './static/info'
    info_bed_file = os.path.join(static_info, 'genesTAIR10.bed')

    fname_head = os.path.splitext(os.path.basename(peak_file))[0]
    fname_head = fname_head[:16] # keep the date and time prefix
    #fname_head = 'test'
    bedmap_result = fname_head + 'bedmap_result.txt'
    bedmap_result = os.path.join(static_temp, bedmap_result)

    cmd = 'bedmap --echo --echo-map-id-uniq --delim \'\t\' ' \
          + '--range 3000' \
          + ' ' + peak_file \
          + ' ' + info_bed_file + ' > %s' % (bedmap_result)
    os.system(cmd)
    print(cmd)
    #cmd = 'cp ./static/temp/20170919_100450_bedmap_result.txt %s' % (bedmap_result) # [to be deleted]
    #os.system(cmd) # [to be deleted]

    f = open(bedmap_result)
    lines = f.readlines()
    d = {}
    for line in lines:
        line = line.strip()
        lst = line.split('\t')
        strength = lst[6]
        if len(lst) >= 11:
            genes = lst[10].replace('"','') # remove double quotes
            for g in genes.split(';'):
                d[g] = strength
    f.close()

    tf_file = os.path.join(static_temp, fname_head + 'tfs.txt')
    f = open(tf_file, 'w')
    s = 'gene_id\tbinding_strength\n%s\t0.0\n' % (protein_id)
    f.write(s)
    f.close()

    target_file = os.path.join(static_temp, fname_head + 'targets.txt')
    s = 'gene_id\tbinding_strength\n'
    for k in sorted(d.keys()):
        s += '%s\t%s\n' % (k, d[k])
    f = open(target_file, 'w')
    f.write(s)
    f.close()
    tissue_specific_edge_file = os.path.join(static_temp, fname_head + 'tissue_specific_edges.txt') # generated by edge_per_tissue.R
    result_dict = {'targets':target_file, 'tissue_specific_edges':tissue_specific_edge_file}
    return result_dict

def get_go_enrichment(fname):
    # make a list of functional targets (from the tissue-specific edge file)
    d = {}
    f = open(fname)
    lines = f.readlines()
    f.close()
    for line in lines[1:]: # skip the head line
        line = line.strip()
        lst = line.split('\t')
        target = (lst[0].split())[0] # gene ID only, no gene name
        if not target.startswith('NA'):
            d[target] = 1
    temp_fname = fname + '.temp'
    f = open(temp_fname, 'w')
    f.write('\n'.join(sorted(d.keys())) + '\n')
    f.close()
    # call goatools
    head = os.path.basename(fname)[0:16]
    go_enrichment_file = './static/temp/' + head + 'go_enrichment.txt'
    cmd = 'python /home/hui/software/goatools/goatools/scripts/find_enrichment.py %s /home/hui/software/goatools/goatools/data/population /home/hui/software/goatools/goatools/data/association --obo /home/hui/software/goatools/goatools/scripts/go-basic.obo > %s' % (temp_fname, go_enrichment_file)
    os.system(cmd)
    return go_enrichment_file
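
# Layout of TF_CATEGORY_FILE (2D_table2.xls), reverse-engineered from the
# indexing in get_tf_category() and get_tf_category_dict() below; treat this
# as an assumption rather than a documented spec:
#   lst[0]     TF gene ID
#   lst[2]     tissue.specific.score (the T/G score)
#   lst[3]     category code TI/TN/GI/GN, used by get_tf_category()
#   lst[5:13]  per-tissue counts encoded as 'a\b' (flower, leaf, meristem,
#              root, seed, seedling, silique, stem)
#   lst[14:]   per-treatment counts encoded as 'a\b' (cold, heat, wounding,
#              drought)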
def get_tf_category(fname):
    ''' Return a dictionary where the key is a gene ID and the value is TI/TN/GI/GN. '''
    d = {}
    f = open(fname)
    lines = f.readlines()
    f.close()
    for line in lines[1:]:
        line = line.strip()
        lst = line.split('\t')
        gene = lst[0]
        category = lst[3]
        if not gene in d:
            d[gene] = category
    return d

def weighted_score_for_tissue(a, b):
    ''' a -- numerator; b -- denominator. '''
    r = a / b
    scale = math.log(b+1, 2) # scale factor
    return scale * r

def weighted_score_for_treatment(a, b):
    if a == 0: # avoid returning 1 in the case of 0/0
        return 0
    a = a + 1
    b = b + 1
    return a/b # avoid dividing by zero

def TorG(x, tau):
    ''' Phil's version '''
    if (x >= tau):
        return ('T', x)
    else:
        return ('G', x)

def TorGorN(x, tau):
    if (x >= tau):
        return 'T'
    elif x >= 0:
        return 'G'
    else:
        return 'N'

def IorN(lst):
    ''' lst -- a list of tuples of the form [('cold', 2), ('heat', 2), ...] '''
    L = [x[1] for x in lst]
    maxL = max(L)
    if maxL >= 1:
        return ('I', maxL)
    else:
        return ('N', maxL)

def get_tf_category_dict(fname):
    ''' fname is 2D_table2.xls, whose third column, tissue.specific.score, specifies the strength of being tissue-specific. '''
    f = open(fname)
    lines = f.readlines()
    f.close()
    d = {} # key is the TF gene ID; value is [T/G score, 'I' or 'N']

    for line in lines[1:]: # skip the first line
        line = line.strip()
        lst = line.split('\t')
        tissue_score_lst = []
        tname = ['flower', 'leaf', 'meristem', 'root', 'seed', 'seedling', 'silique', 'stem'] # tissue names in fname
        j = 0
        for x in lst[5:13]: # tissue columns start from column 6
            y = x.split('\\')
            t = (tname[j], weighted_score_for_tissue(int(y[0]), int(y[1])))
            j += 1
            tissue_score_lst.append(t)
        #torg = TorG(float(lst[2]), tissue_specific_tau) # tissue_specific_tau = 1.9 would make SEP3 a T
        torg_score = float(lst[2])

        treat_score_lst = []
        tname = ['cold', 'heat', 'wounding', 'drought']
        j = 0
        for x in lst[14:]:
            y = x.split('\\')
            t = (tname[j], weighted_score_for_treatment(int(y[0]), int(y[1])))
            j += 1
            treat_score_lst.append(t)
        iorn = IorN(treat_score_lst)

        tf_id = lst[0]
        if not tf_id in d:
            d[tf_id] = [torg_score, iorn[0]]

    return d


def get_tissue_specific_tau(s):
    s = s.upper()
    index = s.find('TISSUE SPECIFIC TAU')
    if index < 0:
        return -9000.0
    s2 = s[index+19:]
    for x in s2.split():
        if is_number(x):
            return float(x)
    return -9001.0


def get_user_option(s):
    ''' Return a dictionary containing the user's options. '''
    d = {}
    s = s.upper()
    lst = s.split()
    accepted_options = ['TISSUE SPECIFIC TAU', 'JUST TFS']
    for x in accepted_options:
        if x in s and x == 'TISSUE SPECIFIC TAU':
            d[x] = get_tissue_specific_tau(s)
        elif x in s:
            d[x] = 1
    return d


def load_pickle_data(fname):
    with open(fname, 'rb') as f:
        return pickle.load(f)


def num_line(fname):
    ''' Return the number of lines in file fname. '''
    if not os.path.exists(fname):
        return 0
    f = open(fname)
    lines = f.readlines()
    f.close()
    return len(lines)

if __name__ == "__main__":
    print('Call build_network_from_file ...')
    G, SOURCE_NODES = build_network_from_file(EDGE_FILE, TF_CATEGORY_FILE) # [change] use a networkx binary file later
    with open('../Data/temp/SOURCE_NODES.json', 'w') as p:
        json.dump(SOURCE_NODES, p)
    #with open('../Data/temp/G.json', 'w') as p:
    #    json.dump(json_graph.node_link_data(G), p)
    nx.write_gpickle(G, '../Data/temp/G.gpickle')
    print('Network updated and TF categories loaded.')
--
cgit v1.2.1