diff options
author | Hui Lan <lanhui@zjnu.edu.cn> | 2019-12-04 19:03:19 +0800 |
---|---|---|
committer | Hui Lan <lanhui@zjnu.edu.cn> | 2019-12-04 19:03:19 +0800 |
commit | 97fdefab064f63642fa3ece05b807d29b459df31 (patch) | |
tree | a058530023224f3e35b1783996f3530c80c04bc5 /Code/slice_TPM_to_JSON.py |
brain: add python and R code to local repository.
Diffstat (limited to 'Code/slice_TPM_to_JSON.py')
-rw-r--r-- | Code/slice_TPM_to_JSON.py | 164 |
1 files changed, 164 insertions, 0 deletions
# Usage: python slice_TPM_to_JSON.py parameter_for_net.txt
#
# Purpose: Given the matrix TPM.txt, make logarithmised gene
# expression in json format for each gene.  Put the results in
# JSON_DIR.  The results are used for displaying scatterplots in
# Webapp.
#
# Last modified 24 Apr 2017, slcu, hui [use r to do the job, faster]

import sys, os, operator, itertools
import subprocess
import tempfile
import textwrap
import numpy as np
import json

# Contains json for all genes, one json file for each gene.  Each json
# file has the following format:
# {"R0ERR046550XXX": 2.8148097376737438, "R0ERR031542XXX": 2.5193080765053328, ...}
JSON_DIR = '../Data/history/expr/json'

GLB_PARAM_SYMBOL = '%%'  # prefix of a global-parameter line in the parameter file
DATA_SYMBOL = '@'


# read expression TPM
def read_matrix_data(fname):
    '''
    Read a whitespace-separated expression matrix.

    fname - a file, first line is head, first column is row name.

    Returns a dictionary with the following keys:
      'xy'    - {gene: {cond: val, ...}, ...}   first gene, then condition
      'yx'    - {cond: {gene: val, ...}, ...}   first condition, then gene
      'xx'    - {gene: [val, ...], ...}         each item is a row
      'yy'    - {cond: [val, ...], ...}         each item is a column
      'nrow'  - number of data rows read
      'ncol'  - number of data columns
      'rowid' - row names, in file order
      'colid' - column names, in file order

    Blank lines are skipped.  If a row does not have exactly one value
    per column, a message is printed and the program exits with status 1.
    '''
    with open(fname) as f:
        lines = f.readlines()

    # An empty file yields an empty matrix instead of an IndexError.
    colid = lines[0].split()[1:] if lines else []
    rowid = []
    d = {}   # {gene1:{cond1:val1, cond2:val2, ...}, gene2: {...}, ...}
    d2 = {}  # {cond1:{gene1:val1, gene2:val2, ...}, cond2: {...}, ...}
    d3 = {}  # {gene1: [], gene2: [], ...}
    d4 = {}  # {cond1:[], cond2:[], ...}

    for c in colid:
        d2[c] = {}
        d4[c] = []

    for line in lines[1:]:
        fields = line.split()
        if not fields:  # tolerate blank (e.g. trailing) lines
            continue
        g = fields[0]
        levels = fields[1:]
        if len(levels) != len(colid):
            print('Incomplete columns at row %s' % (g))
            sys.exit(1)  # non-zero status: this is an error exit

        values = [float(x) for x in levels]  # convert each cell once
        rowid.append(g)
        d[g] = dict(zip(colid, values))
        d3[g] = values
        for c, v in zip(colid, values):
            d2[c][g] = v
            d4[c].append(v)

    d_return = {}
    d_return['xy'] = d   # first gene, then condition
    d_return['yx'] = d2  # first condition, then gene
    d_return['xx'] = d3  # each item is an array of gene expression levels, i.e., each item is a row
    d_return['yy'] = d4  # each item is an array of gene expression levels, i.e., each item is a column
    d_return['nrow'] = len(rowid)  # the header line is not a data row
    d_return['ncol'] = len(colid)
    d_return['rowid'] = rowid
    d_return['colid'] = colid
    return d_return


def get_key_value(s):
    '''
    Split a 'key=value' string into a (key, value) tuple, both stripped.

    Only the first '=' separates key from value, so the value itself may
    contain '='.  Raises IndexError if s contains no '='.
    '''
    lst = s.split('=', 1)
    k, v = lst[0], lst[1]
    return (k.strip(), v.strip())


def make_global_param_dict(fname):
    '''
    Collect global parameters from the parameter file fname.

    Only lines starting with GLB_PARAM_SYMBOL are used.  The rest of such
    a line holds one or more TAB-separated 'key=value' items.  Returns a
    dictionary {key: value}, with all values kept as strings.
    '''
    d = {}
    with open(fname) as f:
        for line in f:
            line = line.strip()
            if not line.startswith(GLB_PARAM_SYMBOL):
                continue
            # Drop only the leading marker, so that a '%' inside a value
            # does not cut off the key and the text before it.
            s = line[len(GLB_PARAM_SYMBOL):]
            for x in s.split('\t'):  # separate items by TAB
                if x != '':
                    k, v = get_key_value(x)
                    d[k] = v
    return d


def take_log(x):
    '''Return the natural logarithm of x + 1.'''
    return np.log(x+1)


def make_json_file(expr_dict, dir_name, glb_param_dict):
    '''
    Write one json file per gene into dir_name (pure Python version).

    expr_dict      - dictionary returned by read_matrix_data.
    dir_name       - output directory, created if it does not exist.
    glb_param_dict - global parameters; if the value of 'LOGRITHMIZE' is
                     'YES' (case-insensitive), values are transformed with
                     take_log before being written.

    Each file is named <gene>.json and contains {condition: value, ...}.
    '''
    if not os.path.isdir(dir_name):  # create the directory if not exist
        os.makedirs(dir_name)

    # The setting does not change between genes, so look it up once.
    use_log = glb_param_dict['LOGRITHMIZE'].upper() == 'YES'
    d = expr_dict['xy']
    for g in expr_dict['rowid']:
        d2 = d[g]
        if use_log:
            d3 = {k: take_log(v) for k, v in d2.items()}
        else:
            d3 = d2
        filename = os.path.join(dir_name, g + '.json')
        with open(filename, 'w') as f:
            json.dump(d3, f)


def make_json_file_using_r(dir_name, glb_param_dict):  # use r script to make it faster
    '''
    Write one json file per gene into dir_name by running an R script.

    dir_name       - output directory (created by the R script if needed).
    glb_param_dict - global parameters; 'EXPRESSION_MATRIX' is the path of
                     the TAB-separated matrix file and 'LOGRITHMIZE'
                     ('YES' to apply log(x+1)) controls the transformation.

    Requires the Rscript executable and the R package rjson.  If Rscript
    cannot be started or exits with a non-zero status, a message is
    written to stderr; no exception is raised.
    '''
    # NOTE: dir_name and the matrix path are pasted into single-quoted R
    # string literals, so they must not contain a single quote or a
    # backslash.
    r_code = textwrap.dedent('''\
        library(rjson)
        dir.name <- '%s'
        tpm.file <- '%s'
        take.log <- '%s'
        X <- read.table(tpm.file, header=T, check.names=FALSE, sep="\\t")
        gene.id <- as.vector(X[,1])
        X[,1] <- NULL  # remove first column
        if (take.log == 'YES') {
            X <- log(X+1)
        }
        if (!dir.exists(dir.name)) {
            dir.create(dir.name, recursive=TRUE)
        }
        for (i in seq_len(nrow(X))) {
            y <- toJSON(X[i,])
            file.name = paste(dir.name, paste(gene.id[i], 'json', sep='.'), sep='/')
            cat(y, file=file.name)
        }
        ''') % (
            dir_name,
            glb_param_dict['EXPRESSION_MATRIX'],
            glb_param_dict['LOGRITHMIZE'].upper())

    # A uniquely named temporary script avoids clashes between concurrent
    # runs.  Relative paths inside the script are still resolved against
    # the current working directory.
    fd, script_path = tempfile.mkstemp(prefix='slice_TPM_to_JSON_', suffix='.R')
    try:
        with os.fdopen(fd, 'w') as f:
            f.write(r_code)
        try:
            status = subprocess.call(['Rscript', script_path])
        except OSError as e:
            sys.stderr.write('Could not run Rscript: %s\n' % e)
        else:
            if status != 0:
                sys.stderr.write('Rscript exited with status %s\n' % status)
    finally:
        os.remove(script_path)  # always clean up the generated script


def main(argv=None):
    '''Entry point: argv[1] is a single parameter file.'''
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.exit('Usage: python slice_TPM_to_JSON.py parameter_for_net.txt')
    param_file = argv[1]  # a single parameter file
    glb_param_dict = make_global_param_dict(param_file)
    # Slower, pure Python alternative:
    #   expr_dict = read_matrix_data(glb_param_dict['EXPRESSION_MATRIX'])
    #   make_json_file(expr_dict, JSON_DIR, glb_param_dict)
    make_json_file_using_r(JSON_DIR, glb_param_dict)  # faster version


if __name__ == '__main__':
    main()