Diffstat (limited to 'Code')
-rw-r--r--  Code/html_network.py  248
1 file changed, 34 insertions, 214 deletions
diff --git a/Code/html_network.py b/Code/html_network.py
index 3237c55..e4f1dd3 100644
--- a/Code/html_network.py
+++ b/Code/html_network.py
@@ -33,8 +33,6 @@ RNA_SEQ_INFO_HTML_PAGE = 'rnaseqinfo.html'
GENE_ID_TO_GENE_NAME = '../Data/information/AGI-to-gene-names_v2.txt'
CHIP_SEQ_INFO_HTML_PAGE = 'chipseqinfo.html'
-RAKE_STOPLIST_FILE = '../Data/information/SmartStoplist.txt'
-
JSON_DIR = '../Data/history/expr/json' # move this directory to the same place as this file html_network.py, for gene expression scatterplot
JSON_DIR2 = '../Data/history/bind/json2' # for displaying binding plots
C3_DIR = './depend/c3'
@@ -42,135 +40,6 @@ W2UI_DIR = './depend/w2ui'
C3_FILES = ['c3.min.css', 'c3.min.js', 'd3.min.js', 'scatterplot.js', 'barchart.js'] # for displaying scatterplots and binding strength
W2UI_FILES = ['jquery.min.for.w2ui.js', 'w2ui.min.js', 'w2ui.min.css']
ALPHA = 0.6 # weight indicating the importance of number of RNA-seq experiments
-## function definitions
-
-### RAKE rapid automatic keyphrase extraction (NOT USED). Skip it and jump to my function.
-
-def is_number(s):
- try:
- float(s) if '.' in s else int(s)
- return True
- except ValueError:
- return False
-
-
-def load_stop_words(stop_word_file):
- """
- Utility function to load stop words from a file and return as a list of words
- @param stop_word_file Path and file name of a file containing stop words.
- @return list A list of stop words.
- """
- stop_words = []
- for line in open(stop_word_file):
- if line.strip()[0:1] != "#":
- for word in line.split(): # in case more than one per line
- stop_words.append(word)
- return stop_words
-
-
-def separate_words(text, min_word_return_size):
- """
- Utility function to return a list of all words that have a length greater than a specified number of characters.
- @param text The text that must be split into words.
- @param min_word_return_size The minimum number of characters a word must have to be included.
- """
- splitter = re.compile('[^a-zA-Z0-9_\\+\\-/]')
- words = []
- for single_word in splitter.split(text):
- current_word = single_word.strip().lower()
- #leave numbers in phrase, but don't count as words, since they tend to invalidate scores of their phrases
- if len(current_word) > min_word_return_size and current_word != '' and not is_number(current_word):
- words.append(current_word)
- return words
-
-
-def split_sentences(text):
- """
- Utility function to return a list of sentences.
- @param text The text that must be split into sentences.
- """
- sentence_delimiters = re.compile(u'[.!?,;:\t\\\\"\\(\\)\\\'\u2019\u2013]|\\s\\-\\s')
- sentences = sentence_delimiters.split(text)
- return sentences
-
-
-def build_stop_word_regex(stop_word_file_path):
- stop_word_list = load_stop_words(stop_word_file_path)
- stop_word_regex_list = []
- for word in stop_word_list:
- word_regex = r'\b' + word + r'(?![\w-])' # added look ahead for hyphen
- stop_word_regex_list.append(word_regex)
- stop_word_pattern = re.compile('|'.join(stop_word_regex_list), re.IGNORECASE)
- return stop_word_pattern
-
-
-def generate_candidate_keywords(sentence_list, stopword_pattern):
- phrase_list = []
- for s in sentence_list:
- tmp = re.sub(stopword_pattern, '|', s.strip())
- phrases = tmp.split("|")
- for phrase in phrases:
- phrase = phrase.strip().lower()
- if phrase != "":
- phrase_list.append(phrase)
- return phrase_list
-
-
-def calculate_word_scores(phraseList):
- word_frequency = {}
- word_degree = {}
- for phrase in phraseList:
- word_list = separate_words(phrase, 0)
- word_list_length = len(word_list)
- word_list_degree = word_list_length - 1
- #if word_list_degree > 3: word_list_degree = 3 #exp.
- for word in word_list:
- word_frequency.setdefault(word, 0)
- word_frequency[word] += 1
- word_degree.setdefault(word, 0)
- word_degree[word] += word_list_degree #orig.
- #word_degree[word] += 1/(word_list_length*1.0) #exp.
- for item in word_frequency:
- word_degree[item] = word_degree[item] + word_frequency[item]
-
- # Calculate Word scores = deg(w)/freq(w)
- word_score = {}
- for item in word_frequency:
- word_score.setdefault(item, 0)
- word_score[item] = word_degree[item] / (word_frequency[item] * 1.0) #orig.
- #word_score[item] = word_frequency[item]/(word_degree[item] * 1.0) #exp.
- return word_score
-
-
-def generate_candidate_keyword_scores(phrase_list, word_score):
- keyword_candidates = {}
- for phrase in phrase_list:
- keyword_candidates.setdefault(phrase, 0)
- word_list = separate_words(phrase, 0)
- candidate_score = 0
- for word in word_list:
- candidate_score += word_score[word]
- keyword_candidates[phrase] = candidate_score
- return keyword_candidates
-
-
-class Rake(object):
- def __init__(self, stop_words_path):
- self.stop_words_path = stop_words_path
- self.__stop_words_pattern = build_stop_word_regex(stop_words_path)
-
- def run(self, text):
- sentence_list = split_sentences(text)
-
- phrase_list = generate_candidate_keywords(sentence_list, self.__stop_words_pattern)
-
- word_scores = calculate_word_scores(phrase_list)
-
- keyword_candidates = generate_candidate_keyword_scores(phrase_list, word_scores)
-
- sorted_keywords = sorted(keyword_candidates.iteritems(), key=operator.itemgetter(1), reverse=True)
- return sorted_keywords
-
### my functions
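
The RAKE code removed above scores each word as deg(w)/freq(w): freq counts the word's occurrences, and deg adds the number of co-occurring words in each candidate phrase. A minimal Python 3 sketch of that scoring (the removed original is Python 2 and uses iteritems; phrase extraction is simplified to ready-made phrases, and the helper name is invented for illustration):

    # Sketch of RAKE word scoring: score(w) = deg(w) / freq(w), where
    # deg(w) = freq(w) + co-occurrences of w with other words per phrase.
    from collections import defaultdict

    def rake_word_scores(phrases):  # hypothetical helper, not from this file
        freq = defaultdict(int)
        degree = defaultdict(int)
        for phrase in phrases:
            words = phrase.lower().split()
            for w in words:
                freq[w] += 1
                degree[w] += len(words) - 1  # other words in this phrase
        return {w: (degree[w] + freq[w]) / freq[w] for w in freq}

    print(rake_word_scores(['gene expression', 'gene regulatory network']))
    # {'gene': 2.5, 'expression': 2.0, 'regulatory': 3.0, 'network': 3.0}

Longer phrases push their words' scores up, which is why RAKE favors multi-word keyphrases.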
@@ -207,8 +76,6 @@ def show_path(G, lst, options):
print('')
-def k_shortest_paths(G, source, target, k, weight=None):
- return list(islice(nx.shortest_simple_paths(G, source, target, weight=weight), k))
def not_bad_line(s):
if s.strip() == '':
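
The removed k_shortest_paths is a thin wrapper over nx.shortest_simple_paths, which yields simple paths in order of increasing length (or total weight); islice cuts the generator off after k paths. A standalone sketch on a toy graph; note that shortest_simple_paths is not implemented for multigraphs, so the helper could not have run directly on the MultiDiGraph built below:

    # The removed helper, made standalone: take the first k paths from
    # the generator networkx yields in order of increasing length.
    from itertools import islice
    import networkx as nx

    def k_shortest_paths(G, source, target, k, weight=None):
        return list(islice(nx.shortest_simple_paths(G, source, target, weight=weight), k))

    G = nx.DiGraph([('a', 'b'), ('b', 'c'), ('a', 'c')])
    print(k_shortest_paths(G, 'a', 'c', 2))  # [['a', 'c'], ['a', 'b', 'c']]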
@@ -227,6 +94,7 @@ def not_bad_line(s):
return False
return True
+
def build_network_from_file(fname):
''' build the network from the big edge file, edges.txt. '''
MG = nx.MultiDiGraph(max_rsubset_size=1400) # maximum size of conditionR list
@@ -273,16 +141,10 @@ def build_network_from_file(fname):
score = float(lst[2]) # strength of various kinds of relationship.
# Not sure why I distinguished 'all' and 'mix', as the add_edge statements are the same.
- if edge_type == 'all':
- if score > 0:
- MG.add_edge(g2, g1, action='>', weight=score, metric=metric, conditionR=condR_lst, conditionC=condC_lst, rmse=model_fit_measure, edge_date=create_date, subset=tissue_or_method)
- elif score < 0:
- MG.add_edge(g2, g1, action='X', weight=score, metric=metric, conditionR=condR_lst, conditionC=condC_lst, rmse=model_fit_measure, edge_date=create_date, subset=tissue_or_method)
- if edge_type == 'mix':
- if score > 0:
- MG.add_edge(g2, g1, action='>', weight=score, metric=metric, conditionR=condR_lst, conditionC=condC_lst, rmse=model_fit_measure, edge_date=create_date, subset=tissue_or_method)
- elif score < 0:
- MG.add_edge(g2, g1, action='X', weight=score, metric=metric, conditionR=condR_lst, conditionC=condC_lst, rmse=model_fit_measure, edge_date=create_date, subset=tissue_or_method)
+ if score > 0:
+ MG.add_edge(g2, g1, action='>', weight=score, metric=metric, conditionR=condR_lst, conditionC=condC_lst, rmse=model_fit_measure, edge_date=create_date, subset=tissue_or_method)
+ elif score < 0:
+ MG.add_edge(g2, g1, action='X', weight=score, metric=metric, conditionR=condR_lst, conditionC=condC_lst, rmse=model_fit_measure, edge_date=create_date, subset=tissue_or_method)
f.close()
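
Since the 'all' and 'mix' branches called add_edge identically, the collapsed version keys only on the sign of the score: positive scores get action '>', negative scores get action 'X', and a zero score adds no edge. A reduced sketch of that pattern on a MultiDiGraph, with the attribute set trimmed and the helper name invented for illustration:

    # Reduced sketch of the collapsed edge logic: one add_edge call;
    # only the action tag depends on the sign of the score.
    import networkx as nx

    def add_scored_edge(MG, g2, g1, score, **attrs):  # hypothetical helper
        if score == 0:
            return  # neither branch fires for a zero score
        action = '>' if score > 0 else 'X'
        MG.add_edge(g2, g1, action=action, weight=score, **attrs)

    MG = nx.MultiDiGraph()
    add_scored_edge(MG, 'AT1G01010', 'AT2G02020', 0.8, metric=0.9)
    add_scored_edge(MG, 'AT1G01010', 'AT2G02020', -0.3, metric=0.4)
    print(MG.number_of_edges('AT1G01010', 'AT2G02020'))  # 2 parallel edges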
@@ -296,6 +158,7 @@ def get_value(s, delimit):
lst = s.split(delimit, 1) # split by the first delimit
return lst[1].strip()
+
def text_to_dict(fname, ignore_first_line=True):
''' fname is RNA_SEQ_INFO_DATABASE (see above). '''
if not os.path.exists(fname):
@@ -424,12 +287,13 @@ def get_chip_signal(s, d):
lst = s.split()
result = ''
for x in lst:
- desc = d[x]['DESCRIPTION']
- lst2 = desc.split('\t')
- for y in lst2:
- if y.startswith('SIGNAL='):
- result += ';' + y[7:] # 7 means after the '=' in 'SIGNAL='
- break
+ if x in d:
+ desc = d[x]['DESCRIPTION']
+ lst2 = desc.split('\t')
+ for y in lst2:
+ if y.startswith('SIGNAL='):
+ result += ';' + y[7:] # 7 means after the '=' in 'SIGNAL='
+ break
return word_freq(result)
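
The fix adds an `if x in d` guard so that a ChIP-seq sample ID missing from the info dictionary is skipped instead of raising a KeyError that aborts page generation. A self-contained illustration of the guarded SIGNAL= extraction; the sample IDs and description format are invented here, inferred from the y[7:] slicing above:

    # Unknown sample IDs are now skipped instead of raising KeyError.
    d = {'C0001': {'DESCRIPTION': 'TISSUE=seedling\tSIGNAL=high light'}}
    # 'C0002' deliberately absent from d

    def chip_signals(sample_ids, d):  # hypothetical stand-in for get_chip_signal
        result = ''
        for x in sample_ids.split():
            if x in d:  # the guard added in this commit
                for field in d[x]['DESCRIPTION'].split('\t'):
                    if field.startswith('SIGNAL='):
                        result += ';' + field[len('SIGNAL='):]  # len('SIGNAL=') == 7
                        break
        return result

    print(chip_signals('C0001 C0002', d))  # prints: ;high light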
@@ -438,12 +302,13 @@ def get_chip_phenotype(s, d):
lst = s.split()
result = ''
for x in lst:
- desc = d[x]['DESCRIPTION']
- lst2 = desc.split('\t')
- for y in lst2:
- if y.startswith('PHENOTYPE='):
- result += ';' + y[10:] # 10 means after the '=' in 'PHENOTYPE='
- break
+ if x in d:
+ desc = d[x]['DESCRIPTION']
+ lst2 = desc.split('\t')
+ for y in lst2:
+ if y.startswith('PHENOTYPE='):
+ result += ';' + y[10:] # 10 means after the '=' in 'PHENOTYPE='
+ break
return word_freq(result)
@@ -514,47 +379,6 @@ def word_freq3(lst): # for RNA-seq data, bag-of-words model
return ' '.join(first_items)
-def get_rna_signal(s, d):
- ''' extract RNA-seq signal information, and return the words ordered by frequency '''
- lst = s.split()
- result = []
- MAX_WORDS = 60
- if lst[0] == '.': # all RNA samples
- return 'all available signals'
- for x in lst: # x is an RNA sample ID, words by frequency
- if x in d:
- desc = d[x]['description']
- desc_lst = re.split('<br>', desc)
- short_lst = []
- for x in desc_lst:
- short_lst.extend(x.split())
- if len(short_lst) > MAX_WORDS: # the average English word is ~5.1 characters; the first MAX_WORDS words should be informative enough, and longer descriptions cost more computation time.
- short_lst = short_lst[:MAX_WORDS]
- break
- # index = desc.find('<br>')
- # if index > 0:
- # desc = desc[:index]
- result.append((' '.join(short_lst)).strip())
- return word_freq3(result)
-
-
-def get_rna_signal2(s, d): # not very successful, and slow, so NOT used
- ''' extract RNA-seq signal information, and return the words ordered by frequency '''
-
- lst = s.split()
-
- if lst[0] == '.': # all RNA samples
- return 'all available signals'
-
- text = ''
- for x in lst: # x is an RNA sample ID, words by frequency
- if x in d:
- desc = d[x]['description']
- text += desc.strip().rstrip('.') + '. '
-
- rake = Rake(RAKE_STOPLIST_FILE)
- keywords = rake.run(text)
- return '<br>'.join( [ t[0] + ' (' + str(int(t[1])) + ')' for t in keywords ] )
def replace_old_html_page(fname, edge_date):
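
Both get_rna_signal variants removed above reduce sample descriptions to a frequency-ordered word list; the first caps each description at MAX_WORDS words before handing the fragments to word_freq3. A minimal sketch of that frequency-ordering idea with collections.Counter (the real word_freq3 lives elsewhere in this file and may filter or normalize differently):

    # Order description words by how often they appear across samples,
    # capping the words taken per sample, as the removed code did.
    from collections import Counter

    def words_by_freq(descriptions, max_words=60):  # hypothetical helper
        counts = Counter()
        for desc in descriptions:
            counts.update(desc.split()[:max_words])
        return ' '.join(w for w, _ in counts.most_common())

    print(words_by_freq(['leaf drought stress', 'root drought']))
    # drought leaf stress root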
@@ -607,16 +431,6 @@ def make_html_page_for_condition(fname, tf_name, target_name, condRstr, condCstr
s = '<p><a id=\"myLink\" href=\"javascript:void(0);\" onclick=\"drawScatterPlot(\'%s\',\'%s\', \'%s\', %s);\">Click for gene expression scatter-plot</a></p> <p id=\"chart\"></p>' % (gene1_file, gene2_file, rnaseq_info_file, cond_lst_str)
f.write(s)
- global glb_rna_seq_info_dict
- #s = get_rna_signal(condRstr, glb_rna_seq_info_dict) # DISABLED since this is SLOWEST part
- # if s.startswith('all available'):
- # f.write('<h3>Signal</h3>' + '<p>' + s + '</p>')
- # else:
- # f.write('<h3>Signal</h3> <p>Note: words are ordered by frequency.</p>' + '<p>' + s + '</p>')
-
- # f.write('<p>%s<p>' % (make_link_string_for_cond(condRstr, 'rnaseq')))
-
- ### ChIP-seq
f.write('<h2>ChIP-seq experiments</h2>')
gene1_file = os.path.join('json2', id_lst[0] + '.json') # TF
gene2_file = os.path.join('json2', id_lst[1] + '.json' ) # target
@@ -720,6 +534,7 @@ def make_w2ui_table_page(fname, gene_str, download_str, dict_lst_regulates, dict
download_str)
result = start_part + grid1 + grid2 + end_part
+
# minify html
lst = re.split(r'\s{2,}', result)
result = ''.join(lst)
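
The minify step splits the assembled page on runs of two or more whitespace characters and concatenates the pieces, so indentation and blank lines vanish while single spaces inside tags survive. A quick check of the regex's behavior:

    # Runs of 2+ whitespace characters disappear; single spaces survive.
    import re

    page = '<table>\n    <tr> <td>TF</td> </tr>\n  </table>'
    print(''.join(re.split(r'\s{2,}', page)))
    # <table><tr> <td>TF</td> </tr></table>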
@@ -730,14 +545,15 @@ def make_w2ui_table_page(fname, gene_str, download_str, dict_lst_regulates, dict def make_html_page(node, G, fname, agi2name_dict):
''' Make html pages for node's successors and predecessors. '''
- #f.write('<p><a href=%s>Go to index page</a></p>' % ('../summary.html'))
- #download_str = '<a href=\'%s\'>Download all edges</a>' % ('./edges.txt.zip') add in future
+
download_str = ''
gname = get_name(node, agi2name_dict)
+ # gene_str includes both the gene id and the gene name (if possible)
if node.strip() == gname.strip(): # id only
gene_str = node
else:
gene_str = '%s' % (node + ' ' + gname)
+
N = G.graph['max_rsubset_size']
predecessors = G.predecessors(node)
@@ -745,6 +561,7 @@ def make_html_page(node, G, fname, agi2name_dict):
d1 = {}
d2 = {}
+
for n in successors:
name = n.split()[0] + '.html'
d = G.get_edge_data(node, n) # n is node's target
@@ -760,7 +577,7 @@ def make_html_page(node, G, fname, agi2name_dict):
info_page_path = os.path.join(os.path.dirname(fname), info_page)
tf_name = get_name(node, agi2name_dict)
target_name = get_name(n, agi2name_dict)
- make_html_page_for_condition(info_page_path, tf_name, target_name, R, C, edge_date, subset) # ***
+ #make_html_page_for_condition(info_page_path, tf_name, target_name, R, C, edge_date, subset) # ***
d1[info_page] = float(d[k]['metric'])
display_name = n + ' ' + ('' if target_name == n else target_name)
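
get_edge_data on a MultiDiGraph returns a dict keyed by edge key, one entry per parallel edge between the pair, which is why the surrounding code indexes it as d[k]['metric']. A small demonstration:

    # MultiDiGraph.get_edge_data returns {edge_key: attr_dict}.
    import networkx as nx

    G = nx.MultiDiGraph()
    G.add_edge('tf', 'target', metric=0.9)
    G.add_edge('tf', 'target', metric=0.4)
    d = G.get_edge_data('tf', 'target')
    for k in d:
        print(k, d[k]['metric'])
    # 0 0.9
    # 1 0.4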
@@ -802,7 +619,7 @@ def make_html_page(node, G, fname, agi2name_dict):
tf_name = get_name(n, agi2name_dict)
target_name = get_name(node, agi2name_dict)
#if not os.path.exists(info_page_path): # tf->target may already exist; if so, we don't need to make it again
- make_html_page_for_condition(info_page_path, tf_name, target_name, R, C, edge_date, subset) # CHANGE ***
+ #make_html_page_for_condition(info_page_path, tf_name, target_name, R, C, edge_date, subset) # CHANGE ***
d1[info_page] = float(d[k]['metric'])
display_name = n + ' ' + ('' if tf_name == n else tf_name)
@@ -874,11 +691,12 @@ curr_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
s = '<h2>All genes considered</h2>'
s += '<p>Last updated at %s. A total of %d edges.</p>' % (curr_time, total_num_edges)
for n in sorted(G.nodes()): # for each node in the network, find its neighbours.
+ print('[html_network.py] Check %s' % (n))
t = n.split()[0] + '.html'
filepath = os.path.join(DIR_NAME, t)
- successors = G.successors(n)
- predecessors = G.predecessors(n)
+ successors = list(G.successors(n))
+ predecessors = list(G.predecessors(n))
s1 = ''
for sn in successors:
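
Wrapping successors() and predecessors() in list() matters on NetworkX 2.x, where both return one-shot iterators rather than the lists 1.x returned; the loops below traverse them, and an exhausted iterator would silently yield nothing the second time around. A two-line check of the difference, assuming NetworkX 2.x:

    # NetworkX 2.x: successors() is a one-shot iterator.
    import networkx as nx

    G = nx.MultiDiGraph()
    G.add_edge('a', 'b')
    succ = G.successors('a')
    print(list(succ))  # ['b']
    print(list(succ))  # [] -- already exhausted; hence list(...) above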
@@ -897,7 +715,8 @@ for n in sorted(G.nodes()): # for each node in the network, find its neighbours.
s += '<tr> <td valign=\"top\">%s</td> <td valign=\"top\">%s</td></tr>' % (s2, s1)
s += '</table>'
s += '</p>'
-
+ s = '<p>Not implemented.</p>' # don't want full-fledged summary.html
+
make_html_page(n, G, filepath, agi2name_dict)
findex.write(s)
@@ -905,6 +724,7 @@ findex.write('</body></html>')
findex.close()
# copy auxiliary folders and files
+print('[html_network.py] Copy auxiliary folders and files.')
if os.path.isdir(JSON_DIR):
cmd = 'cp -r %s %s' % (JSON_DIR, DIR_NAME)
os.system(cmd)
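
The copy step shells out to cp -r, which ties the script to Unix. The commit keeps that approach; purely as an aside, a portable sketch with shutil would look like this (DIR_NAME's value is an assumption, not taken from this file):

    # Portable equivalent of `cp -r JSON_DIR DIR_NAME` using shutil.
    import os
    import shutil

    JSON_DIR = '../Data/history/expr/json'
    DIR_NAME = './html'  # assumption: output directory used by this script

    if os.path.isdir(JSON_DIR):
        dest = os.path.join(DIR_NAME, os.path.basename(JSON_DIR))
        shutil.copytree(JSON_DIR, dest, dirs_exist_ok=True)  # Python 3.8+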
@@ -939,4 +759,4 @@ for fname in W2UI_FILES:
else:
print('[WARNING] html_network.py: Omit %s. Table may not work without this file. ' % (fpath))
-#print('html_network.py done!')
+print('[html_network.py] Done!')