# Usage: python slice_TPM_to_JSON.py parameter_for_net.txt
#
# Purpose: Given the matrix TPM.txt, make logarithmised gene
# expression in json format for each gene.  Put the results in
# JSON_DIR.  The results are used for displaying scatterplots in
# Webapp.
#
# Last modified 24 Apr 2017, slcu, hui [use r to do the job, faster]

import itertools
import json
import operator
import os
import subprocess
import sys

import numpy as np

# Contains json for all genes, one json file for each gene.  Each json
# file has the following format:
# {"R0ERR046550XXX": 2.8148097376737438, "R0ERR031542XXX": 2.5193080765053328, ...}
JSON_DIR = '../Data/history/expr/json'
GLB_PARAM_SYMBOL = '%%'
DATA_SYMBOL = '@'

# Name of the temporary R script written to the current directory.
_R_SCRIPT_FILE = 'slice_TPM_to_JSON.R'

# R program that slices the expression matrix into one json file per gene.
# Inputs are taken from the command line (output directory, matrix file,
# log flag) rather than being pasted into the source, so paths containing
# quotes cannot break the script.
_R_SCRIPT = r'''library(jsonlite)
args <- commandArgs(trailingOnly=TRUE)
dir.name <- args[1]
tpm.file <- args[2]
take.log <- args[3]
X <- read.table(tpm.file, header=T, check.names=FALSE, sep="\t")
gene.id <- as.vector(X[,1])
X[,1] <- NULL  # remove first column
if (take.log == 'YES') {
    X <- log(X+1)
}
if (!dir.exists(dir.name)) {
    dir.create(dir.name, recursive=TRUE)
}
for (i in seq_len(nrow(X))) {
    y <- toJSON(unbox(X[i, , drop=FALSE]), digits=I(3), pretty=TRUE)
    file.name = paste(dir.name, paste(gene.id[i], 'json', sep='.'), sep='/')
    cat(y, file=file.name)
}
'''


def read_matrix_data(fname):
    '''
    Read a whitespace-separated expression matrix.

    fname - a file, first line is head, first column is row name.

    Returns a dictionary with the following keys:
      'xy'    - {gene: {cond: value}}      (first gene, then condition)
      'yx'    - {cond: {gene: value}}      (first condition, then gene)
      'xx'    - {gene: [values]}           (each item is a row)
      'yy'    - {cond: [values]}           (each item is a column)
      'nrow'  - number of data rows read
      'ncol'  - number of data columns
      'rowid' - list of row names, in file order
      'colid' - list of column names, in file order

    Blank lines are skipped.  The program exits with an error message
    if the file is empty or a row does not have one value per column.
    '''
    with open(fname) as f:
        lines = f.readlines()

    if not lines:
        sys.exit('Empty matrix file %s' % (fname))

    colid = lines[0].strip().split()[1:]
    rowid = []
    d = {}                            # {gene1:{cond1:val1, cond2:val2, ...}, gene2: {...}, ...}
    d2 = dict((c, {}) for c in colid)  # {cond1:{gene1:val1, gene2:val2, ...}, cond2: {...}, ...}
    d3 = {}                           # {gene1: [], gene2: [], ...}
    d4 = dict((c, []) for c in colid)  # {cond1:[], cond2:[], ...}

    for line in lines[1:]:
        lst = line.split()
        if not lst:  # blank line, nothing to parse
            continue
        g = lst[0]
        levels = lst[1:]
        if len(levels) != len(colid):
            # Exit with a message (and non-zero status) instead of a silent 0.
            sys.exit('Incomplete columns at row %s' % (g))

        rowid.append(g)
        d[g] = {}
        d3[g] = []
        for c, level in zip(colid, levels):
            value = float(level)  # convert once, store in all four views
            d[g][c] = value
            d2[c][g] = value
            d3[g].append(value)
            d4[c].append(value)

    return {
        'xy': d,
        'yx': d2,
        'xx': d3,
        'yy': d4,
        'nrow': len(rowid),  # one entry in rowid per data row
        'ncol': len(colid),
        'rowid': rowid,
        'colid': colid,
    }


def get_key_value(s):
    '''
    Split a 'key=value' string into a (key, value) tuple.

    Only the first '=' separates key from value, so the value itself may
    contain '='.  Both parts are stripped of surrounding whitespace.
    Raises ValueError if s contains no '='.
    '''
    parts = s.split('=', 1)
    if len(parts) != 2:
        raise ValueError('Expected key=value, got %r' % (s))
    return (parts[0].strip(), parts[1].strip())


def make_global_param_dict(fname):
    '''
    Collect global parameters from a parameter file.

    Only lines starting with GLB_PARAM_SYMBOL are used.  After the leading
    marker characters are removed, the rest of the line is split on TAB
    and each non-empty item is parsed as key=value.  Later occurrences of
    a key overwrite earlier ones.

    Returns a dictionary {key: value} of strings.
    '''
    d = {}
    with open(fname) as f:
        for line in f:
            line = line.strip()
            if not line.startswith(GLB_PARAM_SYMBOL):
                continue
            # Strip only the leading marker characters, so a marker
            # character inside a value (e.g. '50%') is preserved.
            s = line.lstrip(GLB_PARAM_SYMBOL[-1])
            for x in s.split('\t'):  # separate items by TAB
                if x.strip() != '':
                    k, v = get_key_value(x)
                    d[k] = v
    return d


def take_log(x):
    ''' Return the natural logarithm of x + 1, computed with numpy. '''
    return np.log(x+1)


def make_json_file(expr_dict, dir_name, glb_param_dict):
    '''
    Write one json file per gene (pure Python, slower version).

    expr_dict      - dictionary returned by read_matrix_data.
    dir_name       - output directory, created if it does not exist.
    glb_param_dict - global parameters; if 'LOGRITHMIZE' is YES (case
                     insensitive) values are transformed with take_log.

    Each file is named <gene>.json and holds {condition: value}.
    '''
    if not os.path.isdir(dir_name):  # create the directory if not exist
        os.makedirs(dir_name)

    use_log = glb_param_dict['LOGRITHMIZE'].upper() == 'YES'
    d = expr_dict['xy']
    for g in expr_dict['rowid']:
        d2 = d[g]
        if use_log:
            d3 = dict((k, take_log(v)) for k, v in d2.items())
        else:
            d3 = d2
        filename = os.path.join(dir_name, g + '.json')
        with open(filename, 'w') as f:
            json.dump(d3, f)


def make_json_file_using_r(dir_name, glb_param_dict):
    '''
    Write one json file per gene using an R script (faster version).

    dir_name       - output directory; the R script creates it (including
                     missing parent directories) if it does not exist.
    glb_param_dict - global parameters; 'EXPRESSION_MATRIX' is the matrix
                     file and 'LOGRITHMIZE' (YES, case insensitive)
                     switches on the log(x+1) transform.

    A temporary script, slice_TPM_to_JSON.R, is written to the current
    directory, run with Rscript, and removed afterwards.  Raises
    subprocess.CalledProcessError if Rscript exits with a non-zero status.
    '''
    # Look the parameters up first so a missing key leaves no stray script.
    tpm_file = glb_param_dict['EXPRESSION_MATRIX']
    take_log_flag = glb_param_dict['LOGRITHMIZE'].upper()

    with open(_R_SCRIPT_FILE, 'w') as f:  # make a R script
        f.write(_R_SCRIPT)
    try:
        # Argument list (no shell), so paths need no quoting or escaping.
        subprocess.check_call(
            ['Rscript', _R_SCRIPT_FILE, dir_name, tpm_file, take_log_flag])
    finally:
        if os.path.exists(_R_SCRIPT_FILE):
            os.remove(_R_SCRIPT_FILE)


def main():
    ''' Read the parameter file named on the command line and make json files. '''
    if len(sys.argv) < 2:
        sys.exit('Usage: python slice_TPM_to_JSON.py parameter_for_net.txt')
    param_file = sys.argv[1]  # a single parameter file
    glb_param_dict = make_global_param_dict(param_file)
    # Slower, pure Python alternative:
    #   make_json_file(read_matrix_data(glb_param_dict['EXPRESSION_MATRIX']),
    #                  JSON_DIR, glb_param_dict)
    make_json_file_using_r(JSON_DIR, glb_param_dict)  # faster version


if __name__ == '__main__':
    main()