From c689d44bfae6cf9803fde7105f0efdaf405963c5 Mon Sep 17 00:00:00 2001
From: Hui Lan 
Date: Thu, 26 Mar 2020 18:24:53 +0800
Subject: html_network.py: remove unused and redundant code to simplify the script
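
Drop the unused RAKE keyphrase-extraction code (stop-word handling, word
scoring and the Rake class), the unused k_shortest_paths helper, and the
disabled get_rna_signal/get_rna_signal2 functions.  Collapse the identical
'all'/'mix' branches in build_network_from_file, guard the dictionary
lookups in get_chip_signal and get_chip_phenotype, disable the slow
make_html_page_for_condition calls, and stub out the full summary.html
listing.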
---
 Code/html_network.py | 248 +++++++--------------------------------------------
 1 file changed, 34 insertions(+), 214 deletions(-)
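
Notes (reviewer aid; git am ignores text between the diffstat and the diff):

The 'all' and 'mix' branches in build_network_from_file were identical, so
after this patch only the sign of the score decides how an edge is added.
A minimal, hypothetical sketch of the surviving logic (add_regulation_edge
and the gene IDs are made up for illustration; the real call also passes
metric, conditionR, conditionC, rmse, edge_date and subset):

    import networkx as nx

    MG = nx.MultiDiGraph()

    def add_regulation_edge(graph, tf, target, score):
        # '>' marks a positive (activating) edge, 'X' a negative
        # (repressing) one; score == 0 adds no edge, matching the
        # if/elif in the hunk below.
        if score > 0:
            graph.add_edge(tf, target, action='>', weight=score)
        elif score < 0:
            graph.add_edge(tf, target, action='X', weight=score)

    add_regulation_edge(MG, 'AT1G01060', 'AT4G25470', 0.8)  # example IDs

Similarly, the new 'if x in d' guards in get_chip_signal and
get_chip_phenotype skip sample IDs that have no entry in the ChIP-seq info
dict instead of raising KeyError.  A made-up illustration of the guarded
lookup (sample IDs and the DESCRIPTION value are invented; the
tab-separated SIGNAL=/PHENOTYPE= fields follow the parsing code in the
hunks):

    d = {'C0001': {'DESCRIPTION': 'SIGNAL=leaf\tPHENOTYPE=dwarf'}}
    for x in 'C0001 C9999'.split():  # C9999 has no metadata entry
        if x in d:                   # guard added by this patch
            print(d[x]['DESCRIPTION'])
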
diff --git a/Code/html_network.py b/Code/html_network.py
index 3237c55..e4f1dd3 100644
--- a/Code/html_network.py
+++ b/Code/html_network.py
@@ -33,8 +33,6 @@ RNA_SEQ_INFO_HTML_PAGE  = 'rnaseqinfo.html'
 GENE_ID_TO_GENE_NAME    = '../Data/information/AGI-to-gene-names_v2.txt'
 CHIP_SEQ_INFO_HTML_PAGE = 'chipseqinfo.html'
 
-RAKE_STOPLIST_FILE      = '../Data/information/SmartStoplist.txt'
-
 JSON_DIR                = '../Data/history/expr/json'   # move this directory to the same place as this file html_network.py, for gene expression scatterplot
 JSON_DIR2               = '../Data/history/bind/json2'  # for displaying binding plots
 C3_DIR                  = './depend/c3'
@@ -42,135 +40,6 @@ W2UI_DIR                = './depend/w2ui'
 C3_FILES                = ['c3.min.css', 'c3.min.js', 'd3.min.js', 'scatterplot.js', 'barchart.js'] # for displaying scatterplots and binding strength
 W2UI_FILES              = ['jquery.min.for.w2ui.js', 'w2ui.min.js', 'w2ui.min.css']
 ALPHA                   = 0.6 # weight indicating the importance of number of RNA-seq experiments
-## function definitions
-
-### RAKE rapid automatic keyphrase extraction (NOT USED).  Skip it and jump to my function.
-
-def is_number(s):
-    try:
-        float(s) if '.' in s else int(s)
-        return True
-    except ValueError:
-        return False
-
-
-def load_stop_words(stop_word_file):
-    """
-    Utility function to load stop words from a file and return as a list of words
-    @param stop_word_file Path and file name of a file containing stop words.
-    @return list A list of stop words.
-    """
-    stop_words = []
-    for line in open(stop_word_file):
-        if line.strip()[0:1] != "#":
-            for word in line.split():  # in case more than one per line
-                stop_words.append(word)
-    return stop_words
-
-
-def separate_words(text, min_word_return_size):
-    """
-    Utility function to return a list of all words that are have a length greater than a specified number of characters.
-    @param text The text that must be split in to words.
-    @param min_word_return_size The minimum no of characters a word must have to be included.
-    """
-    splitter = re.compile('[^a-zA-Z0-9_\\+\\-/]')
-    words = []
-    for single_word in splitter.split(text):
-        current_word = single_word.strip().lower()
-        #leave numbers in phrase, but don't count as words, since they tend to invalidate scores of their phrases
-        if len(current_word) > min_word_return_size and current_word != '' and not is_number(current_word):
-            words.append(current_word)
-    return words
-
-
-def split_sentences(text):
-    """
-    Utility function to return a list of sentences.
-    @param text The text that must be split in to sentences.
-    """
-    sentence_delimiters = re.compile(u'[.!?,;:\t\\\\"\\(\\)\\\'\u2019\u2013]|\\s\\-\\s')
-    sentences = sentence_delimiters.split(text)
-    return sentences
-
-
-def build_stop_word_regex(stop_word_file_path):
-    stop_word_list = load_stop_words(stop_word_file_path)
-    stop_word_regex_list = []
-    for word in stop_word_list:
-        word_regex = r'\b' + word + r'(?![\w-])'  # added look ahead for hyphen
-        stop_word_regex_list.append(word_regex)
-    stop_word_pattern = re.compile('|'.join(stop_word_regex_list), re.IGNORECASE)
-    return stop_word_pattern
-
-
-def generate_candidate_keywords(sentence_list, stopword_pattern):
-    phrase_list = []
-    for s in sentence_list:
-        tmp = re.sub(stopword_pattern, '|', s.strip())
-        phrases = tmp.split("|")
-        for phrase in phrases:
-            phrase = phrase.strip().lower()
-            if phrase != "":
-                phrase_list.append(phrase)
-    return phrase_list
-
-
-def calculate_word_scores(phraseList):
-    word_frequency = {}
-    word_degree = {}
-    for phrase in phraseList:
-        word_list = separate_words(phrase, 0)
-        word_list_length = len(word_list)
-        word_list_degree = word_list_length - 1
-        #if word_list_degree > 3: word_list_degree = 3 #exp.
-        for word in word_list:
-            word_frequency.setdefault(word, 0)
-            word_frequency[word] += 1
-            word_degree.setdefault(word, 0)
-            word_degree[word] += word_list_degree  #orig.
-            #word_degree[word] += 1/(word_list_length*1.0) #exp.
-    for item in word_frequency:
-        word_degree[item] = word_degree[item] + word_frequency[item]
-
-    # Calculate Word scores = deg(w)/frew(w)
-    word_score = {}
-    for item in word_frequency:
-        word_score.setdefault(item, 0)
-        word_score[item] = word_degree[item] / (word_frequency[item] * 1.0)  #orig.
-    #word_score[item] = word_frequency[item]/(word_degree[item] * 1.0) #exp.
-    return word_score
-
-
-def generate_candidate_keyword_scores(phrase_list, word_score):
-    keyword_candidates = {}
-    for phrase in phrase_list:
-        keyword_candidates.setdefault(phrase, 0)
-        word_list = separate_words(phrase, 0)
-        candidate_score = 0
-        for word in word_list:
-            candidate_score += word_score[word]
-        keyword_candidates[phrase] = candidate_score
-    return keyword_candidates
-
-
-class Rake(object):
-    def __init__(self, stop_words_path):
-        self.stop_words_path = stop_words_path
-        self.__stop_words_pattern = build_stop_word_regex(stop_words_path)
-
-    def run(self, text):
-        sentence_list = split_sentences(text)
-
-        phrase_list = generate_candidate_keywords(sentence_list, self.__stop_words_pattern)
-
-        word_scores = calculate_word_scores(phrase_list)
-
-        keyword_candidates = generate_candidate_keyword_scores(phrase_list, word_scores)
-
-        sorted_keywords = sorted(keyword_candidates.iteritems(), key=operator.itemgetter(1), reverse=True)
-        return sorted_keywords
-
 
 ### my functions
 
@@ -207,8 +76,6 @@ def show_path(G, lst, options):
     print('')
 
 
-def k_shortest_paths(G, source, target, k, weight=None):
-    return list(islice(nx.shortest_simple_paths(G, source, target, weight=weight), k))
 
 def not_bad_line(s):
     if s.strip() == '':
@@ -227,6 +94,7 @@ def not_bad_line(s):
         return False
     return True
 
+
 def build_network_from_file(fname):
     ''' build the network from the big edge file, edges.txt. '''
     MG = nx.MultiDiGraph(max_rsubset_size=1400) # maximum size of conditionR list
@@ -273,16 +141,10 @@ def build_network_from_file(fname):
             score = float(lst[2]) # strength of various kinds of relationship.
             
             # Not sure why I distinguished 'all' and 'mix', as the add_edge statements are the same.
-            if edge_type == 'all':
-                if score > 0:
-                    MG.add_edge(g2, g1, action='>', weight=score, metric=metric, conditionR=condR_lst, conditionC=condC_lst, rmse=model_fit_measure, edge_date=create_date, subset=tissue_or_method)
-                elif score < 0:
-                    MG.add_edge(g2, g1, action='X', weight=score, metric=metric, conditionR=condR_lst, conditionC=condC_lst, rmse=model_fit_measure, edge_date=create_date, subset=tissue_or_method)
-            if edge_type == 'mix':
-                if score > 0:
-                    MG.add_edge(g2, g1, action='>', weight=score, metric=metric, conditionR=condR_lst, conditionC=condC_lst, rmse=model_fit_measure, edge_date=create_date, subset=tissue_or_method)
-                elif score < 0:
-                    MG.add_edge(g2, g1, action='X', weight=score, metric=metric, conditionR=condR_lst, conditionC=condC_lst, rmse=model_fit_measure, edge_date=create_date, subset=tissue_or_method)
+            if score > 0:
+                MG.add_edge(g2, g1, action='>', weight=score, metric=metric, conditionR=condR_lst, conditionC=condC_lst, rmse=model_fit_measure, edge_date=create_date, subset=tissue_or_method)
+            elif score < 0:
+                MG.add_edge(g2, g1, action='X', weight=score, metric=metric, conditionR=condR_lst, conditionC=condC_lst, rmse=model_fit_measure, edge_date=create_date, subset=tissue_or_method)
 
     f.close()
 
@@ -296,6 +158,7 @@ def get_value(s, delimit):
     lst = s.split(delimit, 1) # split by the first delimit
     return lst[1].strip()
 
+
 def text_to_dict(fname, ignore_first_line=True):
     ''' fname is RNA_SEQ_INFO_DATABASE (see above).  '''
     if not os.path.exists(fname):
@@ -424,12 +287,13 @@ def get_chip_signal(s, d):
     lst = s.split()
     result = ''
     for x in lst:
-        desc = d[x]['DESCRIPTION']
-        lst2 = desc.split('\t')
-        for y in lst2:
-            if y.startswith('SIGNAL='):
-                result += ';' + y[7:] # 7 means after the '=' in 'SIGNAL='
-                break
+        if x in d:
+            desc = d[x]['DESCRIPTION']
+            lst2 = desc.split('\t')
+            for y in lst2:
+                if y.startswith('SIGNAL='):
+                    result += ';' + y[7:] # 7 means after the '=' in 'SIGNAL='
+                    break
     return word_freq(result)
 
 
@@ -438,12 +302,13 @@ def get_chip_phenotype(s, d):
     lst = s.split()
     result = ''
     for x in lst:
-        desc = d[x]['DESCRIPTION']
-        lst2 = desc.split('\t')
-        for y in lst2:
-            if y.startswith('PHENOTYPE='):
-                result += ';' + y[10:]  # 10 means after the '=' in 'PHENOTYPE='
-                break
+        if x in d:
+            desc = d[x]['DESCRIPTION']
+            lst2 = desc.split('\t')
+            for y in lst2:
+                if y.startswith('PHENOTYPE='):
+                    result += ';' + y[10:]  # 10 means after the '=' in 'PHENOTYPE='
+                    break
     return word_freq(result)
 
 
@@ -514,47 +379,6 @@ def word_freq3(lst): # for RNA-seq data, bag-of-words model
     return ' '.join(first_items)
 
 
-def get_rna_signal(s, d):
-    ''' extract RNA-seq signal information, and return the words ordered by frequency '''    
-    lst = s.split()
-    result = []
-    MAX_WORDS = 60
-    if lst[0] == '.': # all RNA samples
-        return 'all available signals'
-    for x in lst: # x is an RNA sample ID, words by frequency
-        if x in d:
-            desc = d[x]['description']
-            desc_lst = re.split('\n', desc)
-            short_lst = []
-            for x in desc_lst:
-                short_lst.extend(x.split())
-                if len(short_lst) > MAX_WORDS: # average english words 5.1, take the first 100 words, should be informative enough. Longer desc require more computation time.
-                    short_lst = short_lst[:MAX_WORDS]
-                    break
-            # index = desc.find('\n')
-            # if index > 0:
-            #     desc = desc[:index]
-            result.append((' '.join(short_lst)).strip())
-    return word_freq3(result)
-
-
-def get_rna_signal2(s, d): # not very successful, and slow, so NOT used
-    ''' extract RNA-seq signal information, and return the words ordered by frequency '''    
-
-    lst = s.split()
-
-    if lst[0] == '.': # all RNA samples
-        return 'all available signals'
-
-    text = ''
-    for x in lst: # x is an RNA sample ID, words by frequency
-        if x in d:
-            desc = d[x]['description']
-            text += desc.strip().rstrip('.') + '. '
-
-    rake = Rake(RAKE_STOPLIST_FILE)
-    keywords = rake.run(text)
-    return '\n'.join( [ t[0] + ' (' + str(int(t[1])) + ')' for t in keywords ] )
 
 
 def replace_old_html_page(fname, edge_date):
@@ -607,16 +431,6 @@ def make_html_page_for_condition(fname, tf_name, target_name, condRstr, condCstr
     s = 'Click for gene expression scatter-plot\n ' % (gene1_file, gene2_file, rnaseq_info_file, cond_lst_str)
     f.write(s)
 
-    global glb_rna_seq_info_dict
-    #s = get_rna_signal(condRstr, glb_rna_seq_info_dict) # DISABLED since this is SLOWEST part
-    # if s.startswith('all available'):
-    #     f.write('Signal\n' + '' + s + '\n')
-    # else:
-    #     f.write('Signal\n Note: words are ordered by frequency.\n' + '' + s + '\n')
-    
-    # f.write('%s\n' % (make_link_string_for_cond(condRstr, 'rnaseq')))
-    
-    ### ChIP-seq
     f.write('\nChIP-seq experiments\n')
     gene1_file = os.path.join('json2', id_lst[0] + '.json') # TF
     gene2_file = os.path.join('json2', id_lst[1] + '.json' ) # target
@@ -720,6 +534,7 @@ def make_w2ui_table_page(fname, gene_str, download_str, dict_lst_regulates, dict
         download_str)
     
     result = start_part + grid1 + grid2 + end_part
+
     # minify html 
     lst = re.split(r'\s{2,}', result)
     result = ''.join(lst)
@@ -730,14 +545,15 @@ def make_w2ui_table_page(fname, gene_str, download_str, dict_lst_regulates, dict
 
 def make_html_page(node, G, fname, agi2name_dict):
     ''' Make html pages for node's successors and predecessors.  '''
-    #f.write('Go to index page\n' % ('../summary.html'))
-    #download_str = 'Download all edges' % ('./edges.txt.zip') add in future
+
     download_str = ''
     gname = get_name(node, agi2name_dict)
+    # gene_str includes both the gene id and the gene name (if available)
     if node.strip() == gname.strip(): # id only
         gene_str = node
     else:
         gene_str = '%s' % (node + ' ' + gname)
+
     N = G.graph['max_rsubset_size']
     
     predecessors = G.predecessors(node)
@@ -745,6 +561,7 @@ def make_html_page(node, G, fname, agi2name_dict):
 
     d1 = {}
     d2 = {}
+
     for n in successors:
         name = n.split()[0] + '.html'
         d = G.get_edge_data(node, n) # n is node's target
@@ -760,7 +577,7 @@ def make_html_page(node, G, fname, agi2name_dict):
             info_page_path = os.path.join(os.path.dirname(fname), info_page)
             tf_name = get_name(node, agi2name_dict)
             target_name = get_name(n, agi2name_dict)
-            make_html_page_for_condition(info_page_path, tf_name, target_name, R, C, edge_date, subset)  # ***
+            #make_html_page_for_condition(info_page_path, tf_name, target_name, R, C, edge_date, subset)  # ***
 
             d1[info_page] = float(d[k]['metric'])
             display_name = n + ' ' + ('' if target_name == n else target_name)
@@ -802,7 +619,7 @@ def make_html_page(node, G, fname, agi2name_dict):
             tf_name = get_name(n, agi2name_dict)
             target_name = get_name(node, agi2name_dict)            
             #if not os.path.exists(info_page_path):  # tf->target may already exits, if so don't need to make it again
-            make_html_page_for_condition(info_page_path, tf_name, target_name, R, C, edge_date, subset)  # CHANGE ***
+            #make_html_page_for_condition(info_page_path, tf_name, target_name, R, C, edge_date, subset)  # CHANGE ***
 
             d1[info_page]  = float(d[k]['metric'])
             display_name = n + ' ' + ('' if tf_name == n else tf_name)
@@ -874,11 +691,12 @@ curr_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
 s = 'All genes considered\n'
 s += 'Last updated at %s. A total of %d edges.\n' % (curr_time, total_num_edges)
 for n in sorted(G.nodes()): # for each node in the network, find its neighbours.
+    print('[html_network.py] Check %s' % (n))
     t = n.split()[0] + '.html'
     filepath = os.path.join(DIR_NAME, t)
 
-    successors = G.successors(n)
-    predecessors = G.predecessors(n)
+    successors = list(G.successors(n))
+    predecessors = list(G.predecessors(n))
 
     s1 = ''
     for sn in successors:
@@ -897,7 +715,8 @@ for n in sorted(G.nodes()): # for each node in the network, find its neighbours.
     s += ' | %s | %s | \n' % (s2, s1)
     s += ''
     s += '\n'
-
+    s = 'Not implemented.\n' # we don't want a full-fledged summary.html
+    
     make_html_page(n, G, filepath, agi2name_dict)
 
 findex.write(s)
@@ -905,6 +724,7 @@ findex.write('