From 97fdefab064f63642fa3ece05b807d29b459df31 Mon Sep 17 00:00:00 2001
From: Hui Lan
Date: Wed, 4 Dec 2019 19:03:19 +0800
Subject: brain: add python and R code to local repository.

---
 Code/create_edges4_k2.py | 253 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 253 insertions(+)
 create mode 100644 Code/create_edges4_k2.py

(limited to 'Code/create_edges4_k2.py')

diff --git a/Code/create_edges4_k2.py b/Code/create_edges4_k2.py
new file mode 100644
index 0000000..cffd160
--- /dev/null
+++ b/Code/create_edges4_k2.py
@@ -0,0 +1,253 @@
+# Usage: python create_edges4.py parameter_for_net.txt
+# Purpose:
+# This script will generate MAX_PROCESS work_on_AGI#.R scripts, each for a target gene. So treat each target separately.
+# The results will be saved as edges.txt.AT3G12580.20170308, where AT3G12580 is the target gene id, and 20170308 is the date.
+# The edges.txt files will be merged together later.
+# Hopefully it will be faster.
+# Make it faster by handling each target separately.
+# Make the memory footprint smaller by splitting TPM.txt into small json files, and converting binding.txt to target_tf.txt first.
+# So we don't need to load the big matrices, TPM.txt and binding.txt.
#
# 7 Mar 2017, slcu, hui

import sys, os, operator, itertools
from datetime import datetime
import time
import json
import subprocess
from geneid2name import make_gene_name_AGI_map_dict
from param4net import make_global_param_dict

EDGE_FILE = '../Data/history/edges/edges.txt'
EDGE_DIR = '../Data/history/edges/one_target' # a directory storing all edge files, one for each target gene
GENE_ID_TO_GENE_NAME = '../Data/information/AGI-to-gene-names_v2.txt' # for gene names
TIME_INTERVAL = 10 # wait this long in seconds before launching another Rscript
MAX_PROCESS = 5 # CHANGE maximum number of Rscript processes allowed to run at the same time
K = 2 # CHANGE number of mixture components to use

####################################
DATA_SYMBOL = '@'
####################################


def get_gene_list(fname):
    '''Return the first whitespace-delimited token (the gene ID) of each
    non-blank line in fname.'''
    result = []
    with open(fname) as f: # 'with' guarantees the handle is closed
        for line in f:
            lst = line.strip().split()
            if lst: # skip blank lines instead of raising IndexError
                result.append(lst[0])
    return result


def get_ordered_gene_list(fname):
    '''Return the gene IDs listed in fname, ordered so that targets with fewer
    edges already recorded in EDGE_FILE come first.

    Targets with no recorded edge at all are placed at the very front (in the
    order they appear in fname); previously they were silently dropped, which
    contradicted the stated intent that never-seen targets be treated first.
    '''
    gene_list = get_gene_list(fname)
    gene_set = set(gene_list) # O(1) membership tests below
    edge_count = {} # target gene id -> number of edges seen in EDGE_FILE
    with open(EDGE_FILE) as f: # stream the file; no need to hold all lines
        for line in f:
            line = line.strip()
            if not line:
                continue
            lst = line.split('\t')
            # first column looks like 'AGI gene-name'; keep only the AGI id
            target = lst[0].split()[0]
            edge_count[target] = edge_count.get(target, 0) + 1

    # targets never having an edge are treated first
    result_gene_lst = [g for g in gene_list if g not in edge_count]
    for t in sorted(edge_count.items(), key=operator.itemgetter(1)): # targets with fewer edges will be on the top
        g = t[0]
        if g in gene_set:
            result_gene_lst.append(g)
    return result_gene_lst


def make_tf_dict(fname):
    '''Parse a target_tf file (tab-separated columns: target, tf, conditions)
    into a nested dict {target: {tf: conditions}}.  Lines with fewer than
    three columns (e.g. blanks) are skipped.'''
    d = {}
    with open(fname) as f:
        for line in f:
            lst = line.strip().split('\t')
            if len(lst) < 3: # skip blank or malformed lines
                continue
            target = lst[0]
            tf = lst[1]
            cond = lst[2]
            d.setdefault(target, {})[tf] = cond
    return d


def make_r_script(fname, target, tf_dict, abs_jsonTPM_dir, num_component):
    '''Write to fname an R script that correlates the target gene with each of
    its candidate TFs (using jsonTPM expression files), falling back to a
    mixture-of-regressions (mixtools::regmixEM) with num_component components
    when the plain correlation is weak.  Returns fname.

    tf_dict maps tf id -> condition string; tfs and conditions are emitted in
    the same key order so the two R vectors stay index-aligned.
    '''
    head = 'k.lst <- c(%d)\n' % (num_component)
    head += 'target <- \'%s\'\n' % (target)
    head += 'id2 <- target\n'
    tfs = ''
    conds = ''
    for k in tf_dict.keys():
        tfs += '\'%s\',' % (k)
        conds += '\'%s\',' % (tf_dict[k])
    head += 'tfs <- c(' + tfs.rstrip(',') + ')\n'
    head += 'conditions <- c(' + conds.rstrip(',') + ')\n'
    head += 'jsonTPM.dir <- \'%s\'\n' % (abs_jsonTPM_dir)
    head += 'AGINAME_FILE <- \'%s\'\n' % (os.path.abspath(GENE_ID_TO_GENE_NAME))
    head += 'output.file <- paste(\'%s/edges.txt\', id2, gsub(\'-\',\'\',Sys.Date()), \'k%d\', sep=\'.\')\n' % (EDGE_DIR, num_component)
    body = '''
    ####### Read data #########################################
    CORR_THRESHOLD <- 0.7
    MIN_SIZE <- 100
    agi <- read.table(AGINAME_FILE, sep='\\t', header=FALSE, row.names=1, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes
    #######################################################
    library(mixtools)
    library(rjson)
    name2 <- agi[id2,1]
    result <- ''
    for (i in 1:length(tfs)) {
        curr.date <- gsub('-','',Sys.Date())
        id1 <- tfs[i]
        name1 <- agi[id1,1]
        cond <- conditions[i]

        file.x <- paste(jsonTPM.dir, paste(id1, '.json', sep=''), sep='/')
        if (!file.exists(file.x)) { next }
        x <- as.data.frame(fromJSON(file = file.x))
        x <- log(x+1)
        rcond.x <- names(x)
        x <- as.vector(t(x)) # convert it to a vector

        file.y <- paste(jsonTPM.dir, paste(id2, '.json', sep=''), sep='/')
        if (!file.exists(file.y)) { break }
        y <- as.data.frame(fromJSON(file = file.y))
        y <- log(y+1)
        rcond.y <- names(y)
        y <- as.vector(t(y)) # convert it to a vector

        rna.sample.id <- rcond.x
        if (all(rcond.x == rcond.y) == FALSE | id1 == id2) { # if the IDs in two json files do not match, or target is the same as tf, then ignore
            next
        }

        index <- x < 0.01 | y < 0.01 # don't include data that is too small
        x <- x[!index]
        y <- y[!index]
        r <- cor(x, y)
        if (abs(r) >= CORR_THRESHOLD) {
            s = sprintf('%s %s\\t%s %s\\t%4.2f\\t%s\\t%s\\t%s\\t%s\\t%s\\n', id2, name2, id1, name1, r, 'all', '.', cond, '.', curr.date)
            result <- paste(result, s, sep='')
            next # a good correlation is found using all experiments, so not necessary to look further
        }

        rna.sample.id <- rna.sample.id[!index] # important to make the index work

        pos_r_max <- -2
        pos_r_N <- 0
        pos_r_index <- c()
        pos_r_loglik <- -100000000

        neg_r_max <- 2
        neg_r_N <- 0
        neg_r_index <- c()
        neg_r_loglik <- -100000000

        for (k in k.lst) {
            em.out <- regmixEM(y, x, maxit=150, epsilon=1e-04, k=k)
            for (j in seq(1,k,1)) {
                index <- which(max.col(em.out$posterior) == j)
                size <- length(index)
                r <- cor(em.out$x[index,2], em.out$y[index])
                if (!is.na(r) && r >= CORR_THRESHOLD && size >= MIN_SIZE && r > pos_r_max && size > pos_r_N) {
                    pos_r_max <- r
                    pos_r_N <- size
                    pos_r_index <- index
                    pos_r_loglik <- em.out$loglik
                }
                if (!is.na(r) && r <= -CORR_THRESHOLD && size >= MIN_SIZE && r < neg_r_max && size > neg_r_N) {
                    neg_r_max <- r
                    neg_r_N <- size
                    neg_r_index <- index
                    neg_r_loglik <- em.out$loglik
                }
            }
        }

        if (pos_r_max > 0) { # has a good positive correlation
            sub.cond <- paste(rna.sample.id[pos_r_index], collapse=' ')
            s = sprintf('%s %s\\t%s %s\\t%4.2f\\t%s\\t%s\\t%s\\t%4.2f\\t%s\\n', id2, name2, id1, name1, pos_r_max, 'mix', sub.cond, cond, pos_r_loglik, curr.date)
            result <- paste(result, s, sep='')
        }
        if (neg_r_max < 0) { # has a good negative correlation
            sub.cond <- paste(rna.sample.id[neg_r_index], collapse=' ')
            s = sprintf('%s %s\\t%s %s\\t%4.2f\\t%s\\t%s\\t%s\\t%4.2f\\t%s\\n', id2, name2, id1, name1, neg_r_max, 'mix', sub.cond, cond, neg_r_loglik, curr.date)
            result <- paste(result, s, sep='')
        }
    }
    cat(result, file=output.file, sep='')
    '''
    content = head + body
    with open(fname, 'w') as f:
        # NOTE(review): lstrip('\t') strips only leading tabs; space-indented
        # template lines keep their indentation, which R ignores anyway.
        f.write('\n'.join([line.lstrip('\t').rstrip() for line in content.split('\n')]))
    return fname


def _count_processes(prefix):
    '''Return the number of lines printed by `ps aux | grep prefix`.
    universal_newlines=True makes communicate() return str on both Python 2
    and Python 3 (under Python 3 it would otherwise be bytes and split('\\n')
    would fail).  The count includes the ps/grep pipeline itself and the
    trailing empty line, so it slightly overestimates the running R scripts.'''
    ps = subprocess.Popen('ps aux | grep %s' % (prefix), shell=True,
                          stdout=subprocess.PIPE, universal_newlines=True) # CHANGE
    return len(ps.communicate()[0].split('\n'))


def wait_a_moment(n, prefix):
    ''' if there are more than n work_on...R scripts running, wait... '''
    time.sleep(TIME_INTERVAL)
    while _count_processes(prefix) > n:
        #print('number of running processes %d' % (len(process_lst)))
        time.sleep(TIME_INTERVAL)


def establish_edges(jsonTPM_dir, d, glb_param_dict, rprefix):
    ''' d - binding dictionary {target:{tf1:c1, tf2:c2}, ... }, c1 c2 are strings of conditions '''

    gene_lst = get_ordered_gene_list(glb_param_dict['GENE_LIST']) # targets with fewer edges will get higher priority. For example, those targets never having an edge will be treated first
    high_gene_lst = glb_param_dict['HIGH_PRIORITY_GENE'].split() # high priority genes CHANGE

    if not os.path.isdir(EDGE_DIR):
        os.makedirs(EDGE_DIR)

    # make a list of targets, putting high-priority targets at the beginning;
    # copy high_gene_lst so we do not mutate the list derived from the params
    final_gene_lst = list(high_gene_lst)
    for x in gene_lst:
        if not x in high_gene_lst:
            final_gene_lst.append(x)

    # process each target
    for target in final_gene_lst: # high priority genes are processed first
        if target in d: # target is in binding dictionary d
            tf_dict = d[target] # in the form of {tf1:c1, tf2:c2}
            if len(tf_dict) > 0: # it has TFs, usually it is the case
                r_file = '../Data/temp/%s_%s_K%d.R' % (rprefix, target, K)
                make_r_script(r_file, target, tf_dict, jsonTPM_dir, K)
                cmd = 'Rscript %s &' % (r_file) # run the Rscript in background
                os.system(cmd) # UNCOMMENT ME
                wait_a_moment(MAX_PROCESS, rprefix) # make sure there are not too many R processes running at the same time. If too many, wait. MAX_PROCESS sets the limit.
## main
# Pipeline: 1) convert TPM.txt into per-gene json files, 2) derive
# target_tf.txt from binding.txt, 3) launch one R script per target gene to
# compute edges.  All heavy lifting is delegated to helper scripts and R.
if len(sys.argv) < 2:
    sys.exit('Usage: python create_edges4_k2.py parameter_for_net.txt')

param_file = sys.argv[1] # a single parameter file for building network, parameter_for_net.txt
glb_param_dict = make_global_param_dict(param_file)
# NOTE(review): agi2name_dict is built but not used below; kept for parity
# with sibling create_edges scripts -- confirm before removing.
agi2name_dict = make_gene_name_AGI_map_dict(GENE_ID_TO_GENE_NAME) # for gene names

#print('Make jsonTPM ...') # CHANGE
cmd = 'python TPM2JSON.py %s' % (param_file) # make jsonTPM directory. The TPM values are not log-transformed.
os.system(cmd)
curr_time = datetime.now().strftime('%Y%m%d_%H%M%S')
JSON_DIR = '../Data/history/expr/jsonTPM_%s' % (curr_time) # for each TPM.txt, there should be a unique jsonTPM directory.
cmd = 'mv ../Data/history/expr/jsonTPM %s' % (JSON_DIR)
os.system(cmd)

#print('Make target tf using binding.txt')
target_tf_fname = '../Data/information/target_tf.txt.' + curr_time
cmd = 'python make_target_tf.py %s > %s' % (param_file, target_tf_fname) # make target_tf.txt CHANGE better to make a temporary copy for this program
os.system(cmd)

#JSON_DIR = '../Data/history/expr/jsonTPM_20170310_1153'
#target_tf_fname = '../Data/information/target_tf.txt.20170310_1153'
#print('Establish edges')
big_tf_dict = make_tf_dict(target_tf_fname)
rscript_prefix = 'WORK%s' % (datetime.now().strftime('%Y%m%d_%H%M'))
establish_edges(os.path.abspath(JSON_DIR), big_tf_dict, glb_param_dict, rscript_prefix)