From c689d44bfae6cf9803fde7105f0efdaf405963c5 Mon Sep 17 00:00:00 2001
From: Hui Lan
Date: Thu, 26 Mar 2020 18:24:53 +0800
Subject: html_network.py: remove many lines to make it simpler

---
 Code/html_network.py | 248 +++++++--------------------------------------------
 1 file changed, 34 insertions(+), 214 deletions(-)

diff --git a/Code/html_network.py b/Code/html_network.py
index 3237c55..e4f1dd3 100644
--- a/Code/html_network.py
+++ b/Code/html_network.py
@@ -33,8 +33,6 @@ RNA_SEQ_INFO_HTML_PAGE = 'rnaseqinfo.html'
 GENE_ID_TO_GENE_NAME = '../Data/information/AGI-to-gene-names_v2.txt'
 CHIP_SEQ_INFO_HTML_PAGE = 'chipseqinfo.html'
 
-RAKE_STOPLIST_FILE = '../Data/information/SmartStoplist.txt'
-
 JSON_DIR = '../Data/history/expr/json' # move this directory to the same place as this file html_network.py, for gene expression scatterplot
 JSON_DIR2 = '../Data/history/bind/json2' # for displaying binding plots
 C3_DIR = './depend/c3'
@@ -42,135 +40,6 @@ W2UI_DIR = './depend/w2ui'
 C3_FILES = ['c3.min.css', 'c3.min.js', 'd3.min.js', 'scatterplot.js', 'barchart.js'] # for displaying scatterplots and binding strength
 W2UI_FILES = ['jquery.min.for.w2ui.js', 'w2ui.min.js', 'w2ui.min.css']
 ALPHA = 0.6 # weight indicating the importance of number of RNA-seq experiments
-## function definitions
-
-### RAKE rapid automatic keyphrase extraction (NOT USED). Skip it and jump to my function.
-
-def is_number(s):
-    try:
-        float(s) if '.' in s else int(s)
-        return True
-    except ValueError:
-        return False
-
-
-def load_stop_words(stop_word_file):
-    """
-    Utility function to load stop words from a file and return as a list of words
-    @param stop_word_file Path and file name of a file containing stop words.
-    @return list A list of stop words.
-    """
-    stop_words = []
-    for line in open(stop_word_file):
-        if line.strip()[0:1] != "#":
-            for word in line.split(): # in case more than one per line
-                stop_words.append(word)
-    return stop_words
-
-
-def separate_words(text, min_word_return_size):
-    """
-    Utility function to return a list of all words that are have a length greater than a specified number of characters.
-    @param text The text that must be split in to words.
-    @param min_word_return_size The minimum no of characters a word must have to be included.
-    """
-    splitter = re.compile('[^a-zA-Z0-9_\\+\\-/]')
-    words = []
-    for single_word in splitter.split(text):
-        current_word = single_word.strip().lower()
-        #leave numbers in phrase, but don't count as words, since they tend to invalidate scores of their phrases
-        if len(current_word) > min_word_return_size and current_word != '' and not is_number(current_word):
-            words.append(current_word)
-    return words
-
-
-def split_sentences(text):
-    """
-    Utility function to return a list of sentences.
-    @param text The text that must be split in to sentences.
- """ - sentence_delimiters = re.compile(u'[.!?,;:\t\\\\"\\(\\)\\\'\u2019\u2013]|\\s\\-\\s') - sentences = sentence_delimiters.split(text) - return sentences - - -def build_stop_word_regex(stop_word_file_path): - stop_word_list = load_stop_words(stop_word_file_path) - stop_word_regex_list = [] - for word in stop_word_list: - word_regex = r'\b' + word + r'(?![\w-])' # added look ahead for hyphen - stop_word_regex_list.append(word_regex) - stop_word_pattern = re.compile('|'.join(stop_word_regex_list), re.IGNORECASE) - return stop_word_pattern - - -def generate_candidate_keywords(sentence_list, stopword_pattern): - phrase_list = [] - for s in sentence_list: - tmp = re.sub(stopword_pattern, '|', s.strip()) - phrases = tmp.split("|") - for phrase in phrases: - phrase = phrase.strip().lower() - if phrase != "": - phrase_list.append(phrase) - return phrase_list - - -def calculate_word_scores(phraseList): - word_frequency = {} - word_degree = {} - for phrase in phraseList: - word_list = separate_words(phrase, 0) - word_list_length = len(word_list) - word_list_degree = word_list_length - 1 - #if word_list_degree > 3: word_list_degree = 3 #exp. - for word in word_list: - word_frequency.setdefault(word, 0) - word_frequency[word] += 1 - word_degree.setdefault(word, 0) - word_degree[word] += word_list_degree #orig. - #word_degree[word] += 1/(word_list_length*1.0) #exp. - for item in word_frequency: - word_degree[item] = word_degree[item] + word_frequency[item] - - # Calculate Word scores = deg(w)/frew(w) - word_score = {} - for item in word_frequency: - word_score.setdefault(item, 0) - word_score[item] = word_degree[item] / (word_frequency[item] * 1.0) #orig. - #word_score[item] = word_frequency[item]/(word_degree[item] * 1.0) #exp. - return word_score - - -def generate_candidate_keyword_scores(phrase_list, word_score): - keyword_candidates = {} - for phrase in phrase_list: - keyword_candidates.setdefault(phrase, 0) - word_list = separate_words(phrase, 0) - candidate_score = 0 - for word in word_list: - candidate_score += word_score[word] - keyword_candidates[phrase] = candidate_score - return keyword_candidates - - -class Rake(object): - def __init__(self, stop_words_path): - self.stop_words_path = stop_words_path - self.__stop_words_pattern = build_stop_word_regex(stop_words_path) - - def run(self, text): - sentence_list = split_sentences(text) - - phrase_list = generate_candidate_keywords(sentence_list, self.__stop_words_pattern) - - word_scores = calculate_word_scores(phrase_list) - - keyword_candidates = generate_candidate_keyword_scores(phrase_list, word_scores) - - sorted_keywords = sorted(keyword_candidates.iteritems(), key=operator.itemgetter(1), reverse=True) - return sorted_keywords - ### my functions @@ -207,8 +76,6 @@ def show_path(G, lst, options): print('') -def k_shortest_paths(G, source, target, k, weight=None): - return list(islice(nx.shortest_simple_paths(G, source, target, weight=weight), k)) def not_bad_line(s): if s.strip() == '': @@ -227,6 +94,7 @@ def not_bad_line(s): return False return True + def build_network_from_file(fname): ''' build the network from the big edge file, edges.txt. ''' MG = nx.MultiDiGraph(max_rsubset_size=1400) # maximum size of conditionR list @@ -273,16 +141,10 @@ def build_network_from_file(fname): score = float(lst[2]) # strength of various kinds of relationship. # Not sure why I distinguished 'all' and 'mix', as the add_edge statements are the same. 
-            if edge_type == 'all':
-                if score > 0:
-                    MG.add_edge(g2, g1, action='>', weight=score, metric=metric, conditionR=condR_lst, conditionC=condC_lst, rmse=model_fit_measure, edge_date=create_date, subset=tissue_or_method)
-                elif score < 0:
-                    MG.add_edge(g2, g1, action='X', weight=score, metric=metric, conditionR=condR_lst, conditionC=condC_lst, rmse=model_fit_measure, edge_date=create_date, subset=tissue_or_method)
-            if edge_type == 'mix':
-                if score > 0:
-                    MG.add_edge(g2, g1, action='>', weight=score, metric=metric, conditionR=condR_lst, conditionC=condC_lst, rmse=model_fit_measure, edge_date=create_date, subset=tissue_or_method)
-                elif score < 0:
-                    MG.add_edge(g2, g1, action='X', weight=score, metric=metric, conditionR=condR_lst, conditionC=condC_lst, rmse=model_fit_measure, edge_date=create_date, subset=tissue_or_method)
+            if score > 0:
+                MG.add_edge(g2, g1, action='>', weight=score, metric=metric, conditionR=condR_lst, conditionC=condC_lst, rmse=model_fit_measure, edge_date=create_date, subset=tissue_or_method)
+            elif score < 0:
+                MG.add_edge(g2, g1, action='X', weight=score, metric=metric, conditionR=condR_lst, conditionC=condC_lst, rmse=model_fit_measure, edge_date=create_date, subset=tissue_or_method)
 
     f.close()
 
@@ -296,6 +158,7 @@ def get_value(s, delimit):
     lst = s.split(delimit, 1) # split by the first delimit
     return lst[1].strip()
 
+
 def text_to_dict(fname, ignore_first_line=True):
     ''' fname is RNA_SEQ_INFO_DATABASE (see above). '''
     if not os.path.exists(fname):
@@ -424,12 +287,13 @@ def get_chip_signal(s, d):
     lst = s.split()
     result = ''
     for x in lst:
-        desc = d[x]['DESCRIPTION']
-        lst2 = desc.split('\t')
-        for y in lst2:
-            if y.startswith('SIGNAL='):
-                result += ';' + y[7:] # 7 means after the '=' in 'SIGNAL='
-                break
+        if x in d:
+            desc = d[x]['DESCRIPTION']
+            lst2 = desc.split('\t')
+            for y in lst2:
+                if y.startswith('SIGNAL='):
+                    result += ';' + y[7:] # 7 means after the '=' in 'SIGNAL='
+                    break
     return word_freq(result)
 
 
@@ -438,12 +302,13 @@ def get_chip_phenotype(s, d):
     lst = s.split()
     result = ''
    for x in lst:
-        desc = d[x]['DESCRIPTION']
-        lst2 = desc.split('\t')
-        for y in lst2:
-            if y.startswith('PHENOTYPE='):
-                result += ';' + y[10:] # 10 means after the '=' in 'PHENOTYPE='
-                break
+        if x in d:
+            desc = d[x]['DESCRIPTION']
+            lst2 = desc.split('\t')
+            for y in lst2:
+                if y.startswith('PHENOTYPE='):
+                    result += ';' + y[10:] # 10 means after the '=' in 'PHENOTYPE='
+                    break
     return word_freq(result)
 
 
@@ -514,47 +379,6 @@ def word_freq3(lst): # for RNA-seq data, bag-of-words model
     return ' '.join(first_items)
 
 
-def get_rna_signal(s, d):
-    ''' extract RNA-seq signal information, and return the words ordered by frequency '''
-    lst = s.split()
-    result = []
-    MAX_WORDS = 60
-    if lst[0] == '.': # all RNA samples
-        return 'all available signals'
-    for x in lst: # x is an RNA sample ID, words by frequency
-        if x in d:
-            desc = d[x]['description']
-            desc_lst = re.split('<br>', desc)
-            short_lst = []
-            for x in desc_lst:
-                short_lst.extend(x.split())
-                if len(short_lst) > MAX_WORDS: # average english words 5.1, take the first 100 words, should be informative enough. Longer desc require more computation time.
-                    short_lst = short_lst[:MAX_WORDS]
-                    break
-            # index = desc.find('<br>')
-            # if index > 0:
-            #     desc = desc[:index]
-            result.append((' '.join(short_lst)).strip())
-    return word_freq3(result)
-
-
-def get_rna_signal2(s, d): # not very successful, and slow, so NOT used
-    ''' extract RNA-seq signal information, and return the words ordered by frequency '''
-
-    lst = s.split()
-
-    if lst[0] == '.': # all RNA samples
-        return 'all available signals'
-
-    text = ''
-    for x in lst: # x is an RNA sample ID, words by frequency
-        if x in d:
-            desc = d[x]['description']
-            text += desc.strip().rstrip('.') + '. '
-
-    rake = Rake(RAKE_STOPLIST_FILE)
-    keywords = rake.run(text)
-    return '<br>'.join( [ t[0] + ' (' + str(int(t[1])) + ')' for t in keywords ] )
 
 
 def replace_old_html_page(fname, edge_date):
@@ -607,16 +431,6 @@ def make_html_page_for_condition(fname, tf_name, target_name, condRstr, condCstr
     s = '<...>Click for gene expression scatter-plot<...>' % (gene1_file, gene2_file, rnaseq_info_file, cond_lst_str)
     f.write(s)
-    global glb_rna_seq_info_dict
-    #s = get_rna_signal(condRstr, glb_rna_seq_info_dict) # DISABLED since this is SLOWEST part
-    # if s.startswith('all available'):
-    #     f.write('<...>Signal<...>' + '<...>' + s + '<...>')
-    # else:
-    #     f.write('<...>Signal<...>Note: words are ordered by frequency.<...>' + '<...>' + s + '<...>')
-
-    # f.write('<...>%s<...>' % (make_link_string_for_cond(condRstr, 'rnaseq')))
-
-
     ### ChIP-seq
     f.write('<...>ChIP-seq experiments<...>')
     gene1_file = os.path.join('json2', id_lst[0] + '.json') # TF
     gene2_file = os.path.join('json2', id_lst[1] + '.json' ) # target
@@ -720,6 +534,7 @@ def make_w2ui_table_page(fname, gene_str, download_str, dict_lst_regulates, dict
                               download_str)
     result = start_part + grid1 + grid2 + end_part
 
+    # minify html
     lst = re.split(r'\s{2,}', result)
     result = ''.join(lst)
 
@@ -730,14 +545,15 @@ def make_html_page(node, G, fname, agi2name_dict):
     ''' Make html pages for node's successors and predecessors. '''
 
-    #f.write('<...>Go to index page<...>' % ('../summary.html'))
-    #download_str = 'Download all edges' % ('./edges.txt.zip') add in future
+    download_str = ''
 
     gname = get_name(node, agi2name_dict)
+    # gene_str include both gene id and gene name (if possible)
     if node.strip() == gname.strip(): # id only
         gene_str = node
     else:
         gene_str = '%s' % (node + ' ' + gname)
+
     N = G.graph['max_rsubset_size']
 
     predecessors = G.predecessors(node)
 
@@ -745,6 +561,7 @@
     d1 = {}
     d2 = {}
 
+
     for n in successors:
         name = n.split()[0] + '.html'
         d = G.get_edge_data(node, n) # n is node's target
@@ -760,7 +577,7 @@
             info_page_path = os.path.join(os.path.dirname(fname), info_page)
             tf_name = get_name(node, agi2name_dict)
             target_name = get_name(n, agi2name_dict)
-            make_html_page_for_condition(info_page_path, tf_name, target_name, R, C, edge_date, subset) # ***
+            #make_html_page_for_condition(info_page_path, tf_name, target_name, R, C, edge_date, subset) # ***
             d1[info_page] = float(d[k]['metric'])
 
             display_name = n + ' ' + ('' if target_name == n else target_name)
@@ -802,7 +619,7 @@
             tf_name = get_name(n, agi2name_dict)
             target_name = get_name(node, agi2name_dict)
             #if not os.path.exists(info_page_path): # tf->target may already exits, if so don't need to make it again
-            make_html_page_for_condition(info_page_path, tf_name, target_name, R, C, edge_date, subset) # CHANGE ***
+            #make_html_page_for_condition(info_page_path, tf_name, target_name, R, C, edge_date, subset) # CHANGE ***
             d1[info_page] = float(d[k]['metric'])
 
             display_name = n + ' ' + ('' if tf_name == n else tf_name)
@@ -874,11 +691,12 @@
 curr_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
 s = '<...>All genes considered<...>'
 s += '<...>Last updated at %s. A total of %d edges.<...>' % (curr_time, total_num_edges)
 for n in sorted(G.nodes()): # for each node in the network, find its neighbours.
+    print('[html_network.py] Check %s' % (n))
     t = n.split()[0] + '.html'
     filepath = os.path.join(DIR_NAME, t)
-    successors = G.successors(n)
-    predecessors = G.predecessors(n)
+    successors = list(G.successors(n))
+    predecessors = list(G.predecessors(n))
 
     s1 = ''
     for sn in successors:
@@ -897,7 +715,8 @@
     s += ' %s %s' % (s2, s1)
     s += ''
     s += '<...>'
-
+    s = '<...>Not implemented.<...>' # don't want full-fledged summary.html
+    make_html_page(n, G, filepath, agi2name_dict)
     findex.write(s)
 
 
@@ -905,6 +724,7 @@
 findex.write('')
 findex.close()
 # copy auxiliary folders and files
+print('[html_network.py] Copy auxiliary folders and files.')
 if os.path.isdir(JSON_DIR):
     cmd = 'cp -r %s %s' % (JSON_DIR, DIR_NAME)
     os.system(cmd)
@@ -939,4 +759,4 @@ for fname in W2UI_FILES:
     else:
         print('[WARNING] html_network.py: Omit %s. Table may not work without this file. ' % (fpath))
 
-#print('html_network.py done!')
+print('[html_network.py] Done!')
-- 
cgit v1.2.1
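
Postscript on the list(G.successors(n)) / list(G.predecessors(n)) change: the patch does not
say why, but the most likely reason is the NetworkX 2.x API, where successors() and
predecessors() return one-shot iterators rather than lists, so any second use of the
result silently yields nothing. A minimal sketch of the pitfall, assuming NetworkX 2.x;
the gene IDs below are made up for the demo and are not taken from edges.txt:

    import networkx as nx

    MG = nx.MultiDiGraph()
    MG.add_edge('AT1G01060', 'AT4G25470')    # hypothetical TF -> target edge

    succ = MG.successors('AT1G01060')        # one-shot iterator in NetworkX 2.x
    print(list(succ))                        # ['AT4G25470']
    print(list(succ))                        # [] -- iterator already exhausted

    succ = list(MG.successors('AT1G01060'))  # materialize once, as the patch does
    print(len(succ), succ)                   # 1 ['AT4G25470'] -- reusable

Materializing each iterator once per node keeps the neighbour lists reusable throughout
the loop body that builds the per-gene pages.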