From 408d73d7bd6977deddd4378dd1ecbaba9c6fa460 Mon Sep 17 00:00:00 2001 From: Hui Lan Date: Fri, 14 Aug 2020 23:26:45 +0800 Subject: start_webapp.py and util_networkx.py: remove them (they are in /var/www/brain/brain). --- Code/start_webapp.py | 562 -------------------------------------------------- Code/util_networkx.py | 151 -------------- 2 files changed, 713 deletions(-) delete mode 100755 Code/start_webapp.py delete mode 100755 Code/util_networkx.py diff --git a/Code/start_webapp.py b/Code/start_webapp.py deleted file mode 100755 index 8891384..0000000 --- a/Code/start_webapp.py +++ /dev/null @@ -1,562 +0,0 @@ -# Brain's web app -# Created in December 2016, slcu, hui -# -# Last modified 29 June 2017 [made function get_successors, udpated function get_predecessors] -# Last modified 3 July 2017 [added method option, so users can input 'AT1G65480 leaf' to get neighours based on RNA-seq samples from leaf tissue. See get_method()] -# Last modified 25 Jan 2018 [Note that if the 8th field (lst[8] in build_network_from_file()) is all 0.0. Edges will be missing. Search workaround in this file to 'fix' it. The 8th field represents the confidence of the edge.] -# Last modified 1 Aug 2019. Can I share global variables G and SOURCE_NODES between requests? Failed to do that. The current solution is save two pickle files under the static folder. - -import networkx as nx -from networkx.readwrite import json_graph -import json -import util_networkx -import os -import re -import random -import numpy as np -import operator -import math -from datetime import datetime -import time -#import pickle -import _pickle as cPickle -import random -###################################### -BRAIN_APP_PATH = '/var/www/brain/brain/' # the place of the application folder -EDGE_FILE = '../Data/temp/edges.txt' # [change] -EDGE_DB = BRAIN_APP_PATH + 'static/edges/edges.sqlite' -TF_CATEGORY_FILE = '../Data/information/2D_table2.xls' # put TFs into four categories: TI, TN, GI, GN. -SHOW_N_NODE_EACH_TIME = 50 -TOP_N_NEIGHBOURS = 30 -STRENGTH_TAU = 0.1 # don't show edges with low strength -ALPHA = 0.6 -###################################### - - -def scale_factor(n, N, LL, alpha): - ''' - n -- number of RNA-seq experiments - N -- maximum number of RNA-seq experiments - LL -- total log likelihood - alpha -- weight for number of RNA-seq experiments - ''' - #print('DEBUG n=%d N=%d LL=%g' % (n,N,LL)) - return alpha * (1.0*n/N) + (1-alpha)*(np.e**(1.0*min(LL,0)/n))**0.1 - -def get_method(s): - s = s.lower() - recognised_methods = ['root', 'bud', 'meristem', 'aerial', 'leaf', 'stem', 'seedling', 'seed', 'shoot', 'flower', 'mixreg', 'all', 'hclust.group', 'hclust.fixed.group', 'wedge', 'mixtool'] - for x in recognised_methods: - if x in s: - return x - return '' - -def contain_whole_word(s, t): - ''' whole word s is in t. ''' - lst = t.split(',') # t is sth like: 'shoot, all, seedling, hclust, seed, root' - lst.append('') # handle the case when s is '', to make it true - return s in lst - -def get_successors(G, gene, method): - ''' Only select successors with good relationship strength with gene ''' - node = gene - successors = [] - for n in G.successors(node): - d = G.get_edge_data(node, n) - max_strength = 0 - for k in d.keys(): # select a best edge - if d[k]['strength'] > max_strength and contain_whole_word(method, d[k]['method']): - max_strength = d[k]['strength'] - if max_strength > STRENGTH_TAU: - successors.append(n) - return successors - -def get_predecessors(G, gene, num, method): - ''' Get top num predecessors of gene ''' - - node = gene - predecessors = [] - # don't include edges with small strength - for n in G.predecessors(node): - d = G.get_edge_data(n, node) - max_strength = 0 - for k in d.keys(): # select a best edge - #print('DEBUG: %s %4.2f %4.2f' % (d[k]['link'], d[k]['weight'], d[k]['strength'] ) ) - if d[k]['strength'] > max_strength and contain_whole_word(method, d[k]['method']): - max_strength = d[k]['strength'] - if max_strength > STRENGTH_TAU: - predecessors.append(n) - - #print('DEBUG: len(predecessors)=%d num=%d' % (len(predecessors), num)) - if len(predecessors) < num: - return predecessors - - d1 = {} - for n in predecessors: - d = G.get_edge_data(n, node) - max_strength = 0 - for k in d.keys(): # select a best edge - if d[k]['strength'] > max_strength: - max_strength = d[k]['strength'] - if n == 'AT3G50410' or n == 'AT1G52150': - print('WARNING: %s should not be here. Method:%s.' % (n, method)) - d1[n] = max_strength - - sorted_lst = sorted(d1.items(), key=operator.itemgetter(1), reverse=True) - a, b = zip(*sorted_lst) - return np.random.choice(a, num, replace=False, p=1.0 * np.array(b)/sum(b)) # choose with probability - - -def get_max_rsubset_size(fname): - ''' get the max number of RNA-seq experiment ''' - max_rsize = 0 - f = open(fname) - for line in f: - line = line.strip() - lst = line.split('\t') - condR_lst = [] - if line != '' and len(lst) > 3: - if len(lst) > 6: - condR_lst = lst[4].split() - size_condR = len(condR_lst) - if size_condR > max_rsize: - max_rsize = size_condR - f.close() - return max_rsize - -def build_network_from_file(edge_fname, tf_category_fname): - - EDGE_DIR = 'static/edges' - tf_category_dict = get_tf_category_dict(tf_category_fname) - - ## build the network - N = get_max_rsubset_size(edge_fname) - G = nx.MultiDiGraph(max_rsubset_size=N) - - source_nodes = [] - d = {} # count dict - f = open(edge_fname) - for line in f: - line = line.strip() - lst = line.split('\t') - if line != '' and random.randint(1,10) < 10: - strength = max(abs(float(lst[2])), float(lst[8])) # prevent strength being zero - #strength = abs(float(lst[2])) # a workaround since lst[8] is now zero as time elapse, check compute_metric() in update_network.py for details - method_or_tissue = lst[9] - g1 = lst[0].split()[0] # target gene ID - g1_label = lst[0].split()[1].split(';')[0] if lst[0].split()[1] != '.' else g1 - g1_name = lst[0].split()[1] if lst[0].split()[1] != '.' else '' - g2 = lst[1].split()[0] # source gene ID - g2_label = lst[1].split()[1].split(';')[0] if lst[1].split()[1] != '.' else g2 - g2_name = lst[1].split()[1] if lst[1].split()[1] != '.' else '' - g1_link = '/' + EDGE_DIR.rstrip('/') + '/' + g1 + '.html' - g2_link = '/' + EDGE_DIR.rstrip('/') + '/' + g2 + '.html' - g1_category = tf_category_dict[g1] if g1 in tf_category_dict else [-9.0, 'N'] # -9.0 means not tissue-specific, and N means not inducible - g1_TG_score = g1_category[0] - g1_TG_category = TorGorN(g1_TG_score, 2.40) # T/G, 2.40 is selected to satisfy 'GI > TI and TN > GN' - g1_IN_category = g1_category[1] # I/N - g2_category = tf_category_dict[g2] if g2 in tf_category_dict else [-9.0, 'N'] - g2_TG_score = g2_category[0] - g2_TG_category = TorGorN(g2_TG_score, 2.40) # 'T' if g2_TG_score >= 2.40 else 'G' - g2_IN_category = g2_category[1] - G.add_node(g1, full_name=g1_name, label=g1_label, link=g1_link, istf='0', tf_category=g1_TG_category+g1_IN_category, tf_TG_score=g1_TG_score, tf_IN_category=g1_IN_category) # if g1 is also a TF, then istf='0' will overwrite it in the following for loop - G.add_node(g2, full_name=g2_name, label=g2_label, link=g2_link, istf='1', tf_category=g2_TG_category+g2_IN_category, tf_TG_score=g2_TG_score, tf_IN_category=g2_IN_category) # tf_category contains default TF category code. It can be modified later given user's input - - if not g2+g1 in d: - d[g2+g1] = 0 - else: - d[g2+g1] += 1 - - link_html_name = '/brain/edges/' + '%s_%s_%d.html' % (g2, g1, d[g2+g1]) - - G.add_edge(g2, g1, weight=float(lst[2]), link=link_html_name, strength=strength, method=method_or_tissue) # g2 is source, and g1 is target - - source_nodes.append(g2) - - f.close() - - source_nodes = list(set(source_nodes)) - for n in source_nodes: - G.node[n]['istf'] = '1' # force source nodes to be TFs - - return G, source_nodes - - -def is_valid_query_gene(gid, G): - if not re.match(r'AT\dG\d+', gid): - return 'Invalid gene ID. Example of a valid gene ID: AT1G80870.' - if not gid in G.nodes(): - return 'Not in graph.' - return '' - -def allowed_file(filename): - return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS - -def validate_chip_peak_file(fname, protein_id, protein_name, status, description): - protein_id = protein_id.strip().upper() - if protein_id == 'E.G., AT0G12345' or not protein_id.startswith('AT') or protein_id.find('G') != 3 or len(protein_id) != 9: - return 'Invalide protein ID: %s.' % (protein_id) - protein_name = protein_name.strip() - description = description.strip() - status = status.strip().upper() - if status == 'UNSPECIFIED': - return 'Please select ChIP-seq experiment status.' - - f = open(fname) - lines = f.readlines() - for line in lines: - line = line.strip() - lst = line.split('\t') - if len(lst) < 10: # check we have at least 10 columns - return 'The line %s should have at least 10 columns (see BED6+4 format).' % line - - if not lst[0].isdigit(): - c = lst[0].lower() - if (c.startswith('chr') and c[3:].isdigit()) or (c in ['mt', 'pt', 'ct']): - pass - else: - return 'First column must be a chromosome number. [line: %s].' % line - - if not lst[1].isdigit(): - return 'Second column (starting position) must be a number. [line: %s].' % line - if not lst[2].isdigit(): - return 'Third column (end position) must be a number. [line: %s].' % line - if not lst[4].isdigit(): - return 'Fourth column (binding strength) must be a number. [line: %s].' % line - - f.close() - return '' - -def edit_chip_peak_file(fname, protein_id, protein_name, status, description): - protein_id = protein_id.strip().upper() - protein_name = protein_name.strip() - description = description.strip() - status = status.strip().upper() - - f = open(fname) - lines = f.readlines() - f.close() - - f = open(fname, 'w') - f.write('# PROTEIN_ID:%s\n' % protein_id) - f.write('# PROTEIN_NAME:%s\n' % protein_name.upper()) - f.write('# STATUS:%s\n' % status.upper()) - f.write('# DESCRIPTION:%s\n' % ' '.join([x.strip() for x in description.split('\n')])) - f.write(''.join(lines)) - f.close() - -# Taken from https://stackoverflow.com/questions/354038/how-do-i-check-if-a-string-is-a-number-float -def is_number(s): - try: - float(s) - return True - except ValueError: - return False - - -def validate_and_edit_rnaseq_file(fname, genes): - protein_id = genes.strip().upper() - protein_id_lst = protein_id.split() - for x in protein_id_lst: - if not x.startswith('AT') or x.find('G') != 3 or len(x) != 9: - return 'Invalide gene ID in %s: %s.' % (fname, x) - - fn = os.path.basename(fname)[16:] - f = open(fname) - lines = f.readlines() - head_line = lines[0].strip() - head_line_lst = head_line.split() - if len(head_line_lst) > 2: - return 'Invalid head line in %s: %s. The head line must contain one or two sample IDs.' % (fn, head_line) - lineno = 1 - for line in lines[1:]: - line = line.strip() - lst = line.split('\t') - x = lst[0] # gene id - if not x.startswith('AT') or x.find('G') != 3 or len(x) != 9: - return 'Invalide gene ID in %s: %s at line %d.' % (fn, x, lineno) - for x in lst[1:]: - if not is_number(x): - return 'Invalid number %s in %s.' % (x, fn) - lineno += 1 - f.close() - return '' - -def write_gene_list(genes_str, directory, fname): - lst = genes_str.strip().split() - keep_lst = [] - for x in lst: - g = x.strip() - if g != '' and g != 'AT0G12345': - keep_lst.append(g) - - f = open(os.path.join(directory, fname), 'w') - if len(keep_lst) > 0: - f.write('\n'.join(keep_lst)) - else: - f.write('AT0G12345\n') # ghost gene - f.close() - -def write_log_file(fname, s): - # in the future, if we see concurrency problems, use the python logging module. - f = open(fname, 'a') - curr_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S') - f.write('[%s] %s\n' % (curr_time, s)) - f.close() - -def number_of_columns(fname): - ''' Return number of transcriptome columns in fname ''' - f = open(fname) - lines = f.readlines() - f.close() - if len(lines) < 2: - return 0 - second_line = lines[1] - lst = second_line.split('\t') - return len(lst) - 1 # number of transcriptome columns - -def map_to_transcriptome_space(fname): - - nc = number_of_columns(fname) - if nc == 1: - cmd = 'Rscript map_to_transcriptome_space.R %s' % (fname) - os.system(cmd) - elif nc == 2: - cmd = 'Rscript map_to_transcriptome_space2.R %s' % (fname) - os.system(cmd) - else: - return {} # an empty dictionary - - head = os.path.basename(fname)[:16] - static_temp = './static/temp/' - if not os.path.isdir(static_temp): # make the directory if not exists - os.makedirs(static_temp) - - f = open(static_temp + head + 'result.txt') - s = f.read() - f.close() - - if nc == 1: - return {'result': s, 'heatmap': head + 'heatmap.png', 'edges': head + 'edges.txt'} - if nc == 2: - return {'head': head, 'result': s, 'heatmap': head + 'heatmap.png', 'edges': head + 'edges.txt', 'diff_expr_genes': head + 'diff_expr_genes.txt'} - - -def peak_to_target_genes(peak_file, protein_id): - ''' Convert bed file to target gene list. Also make two files, date_time_targets.txt and date_time_tfs.txt. ''' - - static_temp = './static/temp' - if not os.path.isdir(static_temp): # make the directory if not exists - os.makedirs(static_temp) - - static_info = './static/info' - info_bed_file = os.path.join(static_info, 'genesTAIR10.bed') - - fname_head = os.path.splitext(os.path.basename(peak_file))[0] - fname_head = fname_head[:16] # get date and time info - #fname_head = 'test' - bedmap_result = fname_head + 'bedmap_result.txt' - bedmap_result = os.path.join(static_temp, bedmap_result) - - cmd = 'bedmap --echo --echo-map-id-uniq --delim \'\t\' ' \ - + '--range 3000' \ - + ' ' + peak_file \ - + ' ' + info_bed_file + ' > %s' % (bedmap_result) - os.system(cmd) - print(cmd) - #cmd = 'cp ./static/temp/20170919_100450_bedmap_result.txt %s' % (bedmap_result) # [to be deleted] - #os.system(cmd) # [to be deleted] - - f = open(bedmap_result) - lines = f.readlines() - d = {} - for line in lines: - line = line.strip() - lst = line.split('\t') - strength = lst[6] - if len(lst) >= 11: - genes = lst[10].replace('"','') # remove double-quotes - for g in genes.split(';'): - d[g] = strength - f.close() - - tf_file = os.path.join(static_temp, fname_head + 'tfs.txt') - f = open(tf_file, 'w') - s = 'gene_id\tbinding_strength\n%s\t0.0\n' % (protein_id) - f.write(s) - f.close() - - target_file = os.path.join(static_temp, fname_head + 'targets.txt') - s = 'gene_id\tbinding_strength\n' - for k in sorted(d.keys()): - s += '%s\t%s\n' % (k, d[k]) - f = open(target_file, 'w') - f.write(s) - f.close() - tissue_specific_edge_file = os.path.join(static_temp, fname_head + 'tissue_specific_edges.txt') # generated by edge_per_tissue.R - result_dict = {'targets':target_file, 'tissue_specific_edges':tissue_specific_edge_file} - return result_dict - -def get_go_enrichment(fname): - # make a list of functional targets (from the tissue specific edge file) - d = {} - f = open(fname) - lines = f.readlines() - f.close() - for line in lines[1:]: # skip the head line - line = line.strip() - lst = line.split('\t') - target = (lst[0].split())[0] # gene id only, no gene name - if not target.startswith('NA'): - d[target] = 1 - temp_fname = fname + '.temp' - f = open(temp_fname, 'w') - f.write('\n'.join(sorted(d.keys())) + '\n') - f.close() - # call goatools - head = os.path.basename(fname)[0:16] - go_enrichment_file = './static/temp/' + head + 'go_enrichment.txt' - cmd = 'python /home/hui/software/goatools/goatools/scripts/find_enrichment.py %s /home/hui/software/goatools/goatools/data/population /home/hui/software/goatools/goatools/data/association --obo /home/hui/software/goatools/goatools/scripts/go-basic.obo > %s' % (temp_fname, go_enrichment_file) - os.system(cmd) - return go_enrichment_file - -def get_tf_category(fname): - ''' Return a dictionary where key is gene id and value is TI/TN/GI/GN. ''' - d = {} - f = open(fname) - lines = f.readlines() - f.close() - for line in lines[1:]: - line = line.strip() - lst = line.split('\t') - gene = lst[0] - category = lst[3] - if not gene in d: - d[gene] = category - return d - -def weighted_score_for_tissue(a, b): - a = a - b = b # demoninator - r = a / b - scale = math.log(b+1, 2) # scale factor - return scale * r - -def weighted_score_for_treatment(a, b): - if a == 0: # avoid returning 1 in the case of 0/0 - return 0 - a = a + 1 - b = b + 1 - return a/b # avoid dividing by zeros - -def TorG(x, tau): - ''' Phil's version ''' - if (x >= tau): - return ('T', x) - else: - return ('G', x) - -def TorGorN(x, tau): - if (x >= tau): - return 'T' - elif x >= 0: - return 'G' - else: - return 'N' - -def IorN(lst): - ''' lst -- a list of tuples of the form [('cold', 2), ('heat', 2), ...]''' - L = [x[1] for x in lst] - maxL = max(L) - if maxL >= 1: - return ('I', maxL) - else: - return ('N', maxL) - -def get_tf_category_dict(fname): - ''' fname is 2D_table2.xls, whose third column tissue.specific.score specifies the strength of being tissue-speficif. ''' - f = open(fname) - lines = f.readlines() - f.close() - d = {} # key is TF and value is category code TI/GI/TN/GN - - for line in lines[1:]: # skip the first line - line = line.strip() - lst = line.split('\t') - tissue_score_lst = [] - tname = ['flower', 'leaf', 'meristem', 'root', 'seed', 'seedling', 'silique', 'stem'] # tissue names in fname - j = 0 - for x in lst[5:13]: # tissue column starts from column 6 - y = x.split('\\') - t = (tname[j], weighted_score_for_tissue(int(y[0]), int(y[1]))) - j += 1 - tissue_score_lst.append(t) - #torg = TorG(float(lst[2]), tissue_specific_tau) # tissue_specific_tau = 1.9 would make SEP3 a T - torg_score = float(lst[2]) - - treat_score_lst = [] - tname = ['cold', 'heat', 'wounding', 'drought'] - j = 0 - for x in lst[14:]: - y = x.split('\\') - t = (tname[j], weighted_score_for_treatment(int(y[0]), int(y[1]))) - j += 1 - treat_score_lst.append(t) - iorn = IorN(treat_score_lst) - - tf_id = lst[0] - if not tf_id in d: - d[tf_id] = [torg_score, iorn[0]] - - return d - - -def get_tissue_specific_tau(s): - s = s.upper() - index = s.find('TISSUE SPECIFIC TAU') - if index < 0: - return -9000.0 - s2 = s[index+19:] - for x in s2.split( ): - if is_number(x): - return float(x) - return -9001.0 - - -def get_user_option(s): - ''' Return a dictionary containing user's input''' - d = {} - s = s.upper() - lst = s.split() - accepted_options = ['TISSUE SPECIFIC TAU', 'JUST TFS'] - for x in accepted_options: - if x in s and x == 'TISSUE SPECIFIC TAU': - d[x] = get_tissue_specific_tau(s) - elif x in s: - d[x] = 1 - return d - - -def load_pickle_data(fname): - with open(fname, 'rb') as f: - return pickle.load(f) - - -def num_line(fname): - ''' Return number of lines in file fname. ''' - if not os.path.exists(fname): - return 0 - f = open(fname) - lines = f.readlines() - f.close() - return len(lines) - -if __name__ == "__main__": - print('Call build_network_from_file ...') - G, SOURCE_NODES = build_network_from_file(EDGE_FILE, TF_CATEGORY_FILE) # [change] use networkx binary file later - with open('../Data/temp/SOURCE_NODES.json', 'w') as p: - json.dump(SOURCE_NODES, p) - #with open('../Data/temp/G.json', 'w') as p: - # json.dump(json_graph.node_link_data(G), p) - nx.write_gpickle(G, '../Data/temp/G.gpickle') - print('Network updated and tf category loaded.') diff --git a/Code/util_networkx.py b/Code/util_networkx.py deleted file mode 100755 index b708610..0000000 --- a/Code/util_networkx.py +++ /dev/null @@ -1,151 +0,0 @@ -""" -Data conversion utility for NetworkX -===================================== - -Convert cytoscape.js style graphs from/to NetworkX object. - -https://networkx.github.io/ - -""" - -import networkx as nx - -# Special Keys -ID = 'id' -NAME = 'name' -DATA = 'data' -ELEMENTS = 'elements' - -NODES = 'nodes' -EDGES = 'edges' - -SOURCE = 'source' -TARGET = 'target' - -DEF_SCALE = 100 - - -def __map_table_data(columns, graph_obj): - data = {} - for col in columns: - if col == 0: - break - - data[col] = graph_obj[col] - - return data - - -def __create_node(node, node_id): - new_node = {} - node_columns = node.keys() - data = __map_table_data(node_columns, node) - # Override special keys - data[ID] = str(node_id) - data[NAME] = str(node_id) - - if 'position' in node.keys(): - position = node['position'] - new_node['position'] = position - - new_node[DATA] = data - return new_node - - -def __build_multi_edge(edge_tuple, g): - source = edge_tuple[0] - target = edge_tuple[1] - key = edge_tuple[2] - data = edge_tuple[3] - - data['source'] = str(source) - data['target'] = str(target) - data['interaction'] = str(key) - return {DATA: data} - - -def __build_edge(edge_tuple, g): - source = edge_tuple[0] - target = edge_tuple[1] - data = edge_tuple[2] - - data['source'] = str(source) - data['target'] = str(target) - return {DATA: data} - - -def __build_empty_graph(): - return { - DATA: {}, - ELEMENTS: { - NODES: [], - EDGES: [] - } - } - - -def from_networkx(g, layout=None, scale=DEF_SCALE): - # Dictionary Object to be converted to Cytoscape.js JSON - cygraph = __build_empty_graph() - - if layout is not None: - pos = map(lambda position: - {'x': position[0]*scale, 'y': position[1]*scale}, - layout.values()) - - nodes = g.nodes() - if isinstance(g, nx.MultiDiGraph) or isinstance(g, nx.MultiGraph): - edges = g.edges(data=True, keys=True) - edge_builder = __build_multi_edge - else: - edges = g.edges(data=True) - edge_builder = __build_edge - - # Map network table data - cygraph[DATA] = __map_table_data(g.graph.keys(), g.graph) - - for i, node_id in enumerate(nodes): - new_node = __create_node(g.node[node_id], node_id) - if layout is not None: - new_node['position'] = pos[i] - - cygraph['elements']['nodes'].append(new_node) - - for edge in edges: - cygraph['elements']['edges'].append(edge_builder(edge, g)) - - return cygraph['elements'] - - -def to_networkx(cyjs, directed=True): - """ - Convert Cytoscape.js-style JSON object into NetworkX object. - - By default, data will be handles as a directed graph. - """ - - if directed: - g = nx.MultiDiGraph() - else: - g = nx.MultiGraph() - - network_data = cyjs[DATA] - if network_data is not None: - for key in network_data.keys(): - g.graph[key] = network_data[key] - - nodes = cyjs[ELEMENTS][NODES] - edges = cyjs[ELEMENTS][EDGES] - - for node in nodes: - data = node[DATA] - g.add_node(data[ID], attr_dict=data) - - for edge in edges: - data = edge[DATA] - source = data[SOURCE] - target = data[TARGET] - - g.add_edge(source, target, attr_dict=data) - - return g -- cgit v1.2.1