# Usage: python TPM2JSON.py parameter_for_net.txt
# Purpose:
# For each gene in TPM.txt, make a json file in directory JSON_DIR.  So we
# don't need to load the whole TPM.txt later (more memory efficient).
# 4 APR 2017, hui, slcu

import sys, os, operator, itertools
import numpy as np
import json

JSON_DIR = '../Data/history/expr/jsonTPM' # Don't change this


def read_matrix_data(fname):
    '''
    Read a whitespace-delimited expression matrix into several lookup tables.

    fname - a file, first line is head, first column is row name.  The head
            line must contain a label for the row-name column, because its
            first token is always dropped.  Blank lines are ignored.

    Return a dictionary with the following keys:
      'xy'        - {gene: {cond: value}}
      'yx'        - {cond: {gene: value}}
      'xx'        - {gene: [values in column order]}, i.e., each item is a row
      'yy'        - {cond: [values in row order]}, i.e., each item is a column
      'yy_sorted' - same as 'yy', each list sorted from largest to smallest
      'nrow'      - number of data rows (head line not counted)
      'ncol'      - number of data columns (row-name column not counted)
      'rowid'     - list of row names in file order
      'colid'     - list of column names in file order

    Prints a message and exits with status 1 if a row does not have exactly
    one value per column.
    '''
    with open(fname) as f:
        lines = f.readlines()

    colid = lines[0].split()[1:]
    rowid = []
    d = {}                              # {gene1:{cond1:val1, cond2:val2, ...}, gene2: {...}, ...}
    d2 = dict((c, {}) for c in colid)   # {cond1:{gene1:val1, gene2:val2, ...}, cond2: {...}, ...}
    d3 = {}                             # {gene1: [], gene2: [], ...}
    d4 = dict((c, []) for c in colid)   # {cond1:[], cond2:[], ...}

    for line in lines[1:]:
        lst = line.split()
        if not lst:
            # Blank line (typically a trailing newline at the end of the
            # file); there is no row name to read, so skip it.
            continue

        g = lst[0]
        levels = lst[1:]
        if len(levels) != len(colid):
            print('Incomplete columns at row %s' % (g))
            sys.exit(1)  # non-zero status so that calling scripts see the failure

        values = [float(x) for x in levels]  # convert once, reuse everywhere
        rowid.append(g)
        d[g] = dict(zip(colid, values))
        d3[g] = values
        for c, v in zip(colid, values):
            d2[c][g] = v
            d4[c].append(v)

    d_return = {}
    d_return['xy'] = d                # first gene, then condition
    d_return['yx'] = d2               # first condition, then gene
    d_return['xx'] = d3               # each item is a row
    d_return['yy'] = d4               # each item is a column
    # Only data rows are appended to rowid, so its length is the row count
    # (the head line is never included and must not be subtracted again).
    d_return['nrow'] = len(rowid)
    d_return['ncol'] = len(colid)
    d_return['rowid'] = rowid
    d_return['colid'] = colid
    d_return['yy_sorted'] = dict((c, sorted(v, reverse=True)) for c, v in d4.items())
    return d_return


def check_json_file(expr_dict, dir_name):
    '''
    Check if json files are good, return True if yes.

    expr_dict - dictionary returned by read_matrix_data.
    dir_name  - directory expected to contain one <gene>.json file per gene.

    Only the first 10 genes are inspected.  A gene passes when its json file
    exists, can be parsed, and holds a dictionary with as many entries as the
    gene has conditions in expr_dict.  Return False if dir_name is not a
    directory or if any inspected gene fails.
    '''
    if not os.path.isdir(dir_name):
        return False

    d = expr_dict['xy']
    for g in expr_dict['rowid'][:10]: # check the first 10 genes
        filename = os.path.join(dir_name, g + '.json')
        if not os.path.exists(filename):
            return False
        try:
            with open(filename) as f:
                loaded = json.load(f)
        except ValueError:
            # Truncated or corrupt file (e.g. an interrupted earlier run):
            # report it as bad so that the files get regenerated.
            return False
        if not isinstance(loaded, dict) or len(d[g]) != len(loaded):
            return False

    return True


def make_json_file(expr_dict, dir_name):
    '''
    Write one json file per gene into dir_name.

    expr_dict - dictionary returned by read_matrix_data.
    dir_name  - output directory, created if it does not exist.

    Each file is named <gene>.json and contains that gene's
    {condition: value} dictionary.  Existing files are overwritten.
    '''
    if not os.path.isdir(dir_name): # create the directory if not exist
        os.makedirs(dir_name)

    d = expr_dict['xy']
    for g in expr_dict['rowid']:
        filename = os.path.join(dir_name, g + '.json')
        with open(filename, 'w') as f:
            json.dump(d[g], f)


def main(argv):
    '''
    Script entry point.

    argv - command line list; argv[1] is a single parameter file whose
           EXPRESSION_MATRIX entry names the expression matrix to convert.

    The json files in JSON_DIR are (re)generated only when check_json_file
    reports that they are missing or bad.
    '''
    if len(argv) < 2:
        print('Usage: python TPM2JSON.py parameter_for_net.txt')
        sys.exit(1)

    # Imported here rather than at module level so that the functions above
    # can be imported and reused without the project's parameter parser.
    from param4net import make_global_param_dict

    param_file = argv[1] # a single parameter file
    glb_param_dict = make_global_param_dict(param_file)
    expr_dict = read_matrix_data(glb_param_dict['EXPRESSION_MATRIX'])
    if not check_json_file(expr_dict, JSON_DIR):
        make_json_file(expr_dict, JSON_DIR)


## main
if __name__ == '__main__':
    main(sys.argv)