From 97fdefab064f63642fa3ece05b807d29b459df31 Mon Sep 17 00:00:00 2001 From: Hui Lan Date: Wed, 4 Dec 2019 19:03:19 +0800 Subject: brain: add python and R code to local repository. --- Code/create_edges0.py | 215 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 215 insertions(+) create mode 100644 Code/create_edges0.py (limited to 'Code/create_edges0.py') diff --git a/Code/create_edges0.py b/Code/create_edges0.py new file mode 100644 index 0000000..75cb1e1 --- /dev/null +++ b/Code/create_edges0.py @@ -0,0 +1,215 @@ +# Usage: python create_edges0.py parameter_for_net.txt +# +# Make it faster by spawning subprocesses. +# +# Quickly create edges using all samples in TPM.txt. TF and targets +# are from target_tf.txt. Results will be written to +# ../Data/history/edge_pool/edges.txt.simple.correlation.all.conditions.date +# target_tf.txt is produced by make_target_tf.py. +# +# +# 26 JAN 2017, hui, slcu +# Last modified 5 APR 2017, hui, slcu + +import sys, os, operator, itertools +from datetime import datetime +from geneid2name import make_gene_name_AGI_map_dict, get_gene_name +from param4net import make_global_param_dict + +TARGET_FILE = '../Data/temp/all_targets.txt' +TF_FILE = '../Data/temp/all_tfs.txt' +RESULT_FILE = '../Data/temp/corr_all.txt' +R_SCRIPT_FILE = '../Data/temp/compute_simple_correlation.r' + +HISTORY_DIR = '../Data/history/edge_pool' # edges.txt.* files are here + + +def get_value(s, delimit): + lst = s.split(delimit) + return lst[1].strip() + + +def get_gene_list(fname): + result = [] + f = open(fname) + for line in f: + line = line.strip() + lst = line.split() + result.append(lst[0]) + f.close() + return result + + +def make_tf_dict(fname): + d = {} + f = open(fname) + for line in f: + line = line.strip() + lst = line.split('\t') + target = lst[0] + tf = lst[1] + cond = lst[2].split() + if not target in d: + d[target] = {tf:cond} + else: + d[target][tf] = cond + f.close() + return d + + +def get_targets_and_tfs(fname): + f = open(fname) + target_lst = [] + tf_lst = [] + for line in f: + line = line.strip() + lst = line.split('\t') + target = lst[0] + tf = lst[1] + target_lst.append(target) + tf_lst.append(tf) + f.close() + return sorted(list(set(target_lst))), sorted(list(set(tf_lst))) + + +def write_lst_to_file(lst, fname): + f = open(fname, 'w') + for x in lst: + f.write(x + '\n') + f.close() + + +def make_r_script(fname, result_file, data_file, target_file, tf_file, r_tau=0.75): + + head = 'OUTPUT_FILE <- \'%s\'\n DATA_FILE <- \'%s\'\n TARGET_FILE <- \'%s\'\n TF_FILE <- \'%s\'\n tau <- %0.2f\n' % (result_file, data_file, target_file, tf_file, r_tau) + + body = ''' + targets <- read.table(TARGET_FILE, header=FALSE) + tfs <- read.table(TF_FILE, header=FALSE) + X <- read.table(DATA_FILE, header=TRUE, check.names=FALSE) + targets <- as.vector(targets$V1) + tfs <- as.vector(tfs$V1) + all_genes <- rownames(X) + + X <- as.matrix(X) + sd.1 <- apply(X, 1, sd) # sd of each row + s0 <- apply(X, 1, function(c) sum(c==0)) # number of zeros in each row + sd.tau <- (quantile(sd.1,na.rm=TRUE)[1] + quantile(sd.1,na.rm=TRUE)[2]) / 2.0 # min SD + good <- sd.1 > max(sd.tau, 0.05) + tf_good <- which( good & (all_genes %in% tfs) == T ) + target_good <- which( good & (all_genes %in% targets) == T ) + + # compute correlation coefficient + X <- log(X + 1) + X[X<0.01] <- NA + if (length(tf_good) < 2) { + c <- cor(t(X[target_good,]), t(X[c(tf_good, tf_good),]), use='pairwise.complete.obs') + } else { + c <- cor(t(X[target_good,]), t(X[tf_good,]), use='pairwise.complete.obs') + } + index <- !is.na(c) & abs(c) >= tau & abs(c) <= 0.99 + row_names <- rownames(c) + col_names <- colnames(c) + result <- data.frame(row = row_names[row(c)[index]], col = col_names[col(c)[index]], r = c[index]) + + # write results + write.table(result, OUTPUT_FILE, col.names=F, row.names=F, sep='\\t', quote=F) + ''' + + f = open(fname, 'w') + content = head + body + lst = [x.strip() for x in content.split('\n')] + f.write('\n'.join(lst)) + f.close() + + +def edit_headline(fname): + ''' Remove gene_id from first line. For easier R matrix reading. ''' + new_fname = fname + '.copy' + f = open(fname) + lines = f.readlines() + f.close() + f = open(new_fname, 'w') + head = lines[0].strip() + head_lst = head.split('\t')[1:] + num_rnaseq = len(head.split('\t')) - 1 + f.write('\t'.join(head_lst) + '\n') + for line in lines[1:]: + f.write(line) + f.close() + return new_fname, num_rnaseq + + +def establish_edges(corr_fname, target_tf_fname, result_fname, agi2name_dict, num_rnaseq, glb_param_dict): + big_tf_dict = make_tf_dict(target_tf_fname) + f = open(corr_fname) + lines = f.readlines() + f.close() + + result = '' + for line in lines: + line = line.strip() + lst = line.split('\t') + target = lst[0] + tf = lst[1] + score = '%4.2f' % (float(lst[2])) + if target in big_tf_dict and tf in big_tf_dict[target]: + target_str = target + ' ' + get_gene_name(target, agi2name_dict) + tf_str = tf + ' ' + get_gene_name(tf, agi2name_dict) + score_str = score + cond_str = ' '.join(big_tf_dict[target][tf]) + curr_date = datetime.now().strftime('%Y%m%d') + method_or_tissue = 'all' if glb_param_dict['EXPRESSION_MATRIX_DESCRIPTION'].strip() == '' else glb_param_dict['EXPRESSION_MATRIX_DESCRIPTION'] + s = '\t'.join([target_str, tf_str, score_str, 'all', str(num_rnaseq), cond_str, '.', curr_date, score_str, method_or_tissue]) + result += s + '\n' + + f = open(result_fname, 'w') + f.write(result) + f.close() + + +def target_tf_file_compare_same(fname1, fname2): + if not os.path.exists(fname1): + return False + if not os.path.exists(fname2): + return False + f1 = open(fname1) + s1 = f1.read() + f1.close() + f2 = open(fname2) + s2 = f2.read() + f2.close() + return s1 == s2 + + +## main +param_file = sys.argv[1] # a single prameter file +glb_param_dict = make_global_param_dict(param_file) +agi2name_dict = make_gene_name_AGI_map_dict(glb_param_dict['GENE_ID_AND_GENE_NAME']) + +target_tf_fname = '../Data/information/target_tf.txt' +if not os.path.exists(target_tf_fname): + print('create_edges0: file %s does not exist. Produce this file use make_target_tf.py.' % (target_tf_fname)) + sys.exit() + +all_targets, all_tfs = get_targets_and_tfs(target_tf_fname) +write_lst_to_file(all_targets, TARGET_FILE) +write_lst_to_file(all_tfs, TF_FILE) + +data_file, num_rnaseq = edit_headline(glb_param_dict['EXPRESSION_MATRIX']) + +make_r_script(R_SCRIPT_FILE, RESULT_FILE, data_file, TARGET_FILE, TF_FILE, 0.60) + +cmd = 'Rscript %s' % (R_SCRIPT_FILE) +os.system(cmd) + +if not os.path.isdir(HISTORY_DIR): + os.makedirs(HISTORY_DIR) + +curr_time = datetime.now().strftime('%Y%m%d_%H%M%S') +result_fname = os.path.join(HISTORY_DIR, 'edges.txt.simple.correlation.all.conditions.' + curr_time) +establish_edges(RESULT_FILE, target_tf_fname, result_fname, agi2name_dict, num_rnaseq, glb_param_dict) # change + +cmd = 'rm -f %s %s %s %s %s' % (data_file, TARGET_FILE, TF_FILE, R_SCRIPT_FILE, RESULT_FILE) +os.system(cmd) +print('Done. Check %s.' % (result_fname)) -- cgit v1.2.1