summaryrefslogtreecommitdiff
path: root/Code/TPM2JSON.py
blob: 6d5a423f39f26164eeaf5d92f69ad5e647005329 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# Usage: python TPM2JSON.py parameter_for_net.txt
# Purpose:
#   For each gene in TPM.txt, make a json file in directory JSON_DIR.  So we don't need to load the whole TPM.txt later (more memory efficient).
# 4 APR 2017, hui, slcu

import sys, os, operator, itertools
import numpy as np
import json
from param4net import make_global_param_dict

JSON_DIR = '../Data/history/expr/jsonTPM' # Don't change this

def read_matrix_data(fname):
    '''
    Load a whitespace-delimited expression matrix into several lookup tables.

    fname - a file, first line is head, first column is row name.  The first
            token of the head line labels the row-name column and is dropped;
            the remaining tokens are the column (condition) names.

    Returns a dictionary with the following keys:
      'xy'        - {gene: {cond: value, ...}, ...}
      'yx'        - {cond: {gene: value, ...}, ...}
      'xx'        - {gene: [values in column order], ...}, i.e., each item is a row
      'yy'        - {cond: [values in row order], ...}, i.e., each item is a column
      'yy_sorted' - same as 'yy', but each list is sorted in descending order
      'nrow'      - number of data rows read (head line not counted)
      'ncol'      - number of columns (conditions)
      'rowid'     - row names, in file order
      'colid'     - column names, in file order

    Blank lines are skipped.  If a row does not hold exactly one value per
    column, a message is printed and the program exits.
    '''
    with open(fname) as f:
        lines = f.readlines()

    colid = lines[0].split()[1:]
    rowid = []
    d = {}                         # {gene1:{cond1:val1, cond2:val2, ...}, gene2: {...}, ...}
    d2 = {c: {} for c in colid}    # {cond1:{gene1:val1, gene2:val2, ...}, cond2: {...}, ...}
    d3 = {}                        # {gene1: [], gene2: [], ...}
    d4 = {c: [] for c in colid}    # {cond1:[], cond2:[], ...}

    for line in lines[1:]:
        lst = line.split()
        if not lst:
            # Blank line (e.g., trailing newline at the end of the file): nothing to parse.
            continue

        g = lst[0]
        levels = lst[1:]
        rowid.append(g)
        if len(levels) != len(colid):
            print('Incomplete columns at row %s' % (g))
            sys.exit()

        # Convert each cell once and share the numbers between all four views.
        values = [float(x) for x in levels]
        d[g] = dict(zip(colid, values))
        d3[g] = values
        for c, v in zip(colid, values):
            d2[c][g] = v
            d4[c].append(v)

    d_return = {}
    d_return['xy'] = d  # first gene, then condition
    d_return['yx'] = d2 # first condition, then gene
    d_return['xx'] = d3 # each item is an array of gene expression levels, i.e., each item is a row
    d_return['yy'] = d4 # each item is an array of gene expression levels, i.e., each item is a column
    d_return['nrow'] = len(rowid) # one entry in rowid per data row
    d_return['ncol'] = len(colid)
    d_return['rowid'] = rowid
    d_return['colid'] = colid
    d_return['yy_sorted'] = {c: sorted(col, reverse=True) for c, col in d4.items()}

    return d_return

def check_json_file(expr_dict, dir_name):
    '''
    Check if json files are good, return True if yes.

    expr_dict - dictionary holding at least 'xy' ({gene: {cond: value}}) and
                'rowid' (list of gene names).
    dir_name  - directory expected to contain one <gene>.json file per gene.

    Only the first 10 genes are inspected.  Return False if dir_name is not a
    directory, or if for any inspected gene the file is missing, cannot be
    parsed as json, does not hold a dictionary, or holds a different number of
    entries than the gene has in expr_dict['xy'].
    '''

    if not os.path.isdir(dir_name):
        return False

    d = expr_dict['xy']
    for g in expr_dict['rowid'][:10]: # check the first 10 lines
        filename = os.path.join(dir_name, g + '.json')
        if not os.path.exists(filename):
            return False
        try:
            with open(filename) as f:
                d3 = json.load(f)
        except ValueError:
            # Truncated or otherwise malformed json counts as a bad file.
            return False
        if not isinstance(d3, dict) or len(d[g]) != len(d3):
            return False

    return True

def make_json_file(expr_dict, dir_name):
    '''
    Write one json file per gene into dir_name.

    expr_dict - dictionary holding at least 'xy' ({gene: {cond: value}}) and
                'rowid' (list of gene names).
    dir_name  - output directory; created if it does not exist.

    Each file is named <gene>.json and contains that gene's {cond: value}
    dictionary.  Existing files with the same name are overwritten.
    '''
    if not os.path.isdir(dir_name): # create the directory if not exist
        os.makedirs(dir_name)

    d = expr_dict['xy']
    for g in expr_dict['rowid']:
        filename = os.path.join(dir_name, g + '.json')
        with open(filename, 'w') as f:
            json.dump(d[g], f)


## main
# Build the per-gene json files unless a spot check says they already exist
# and match the expression matrix named in the parameter file.
if len(sys.argv) < 2:
    # Exit with a usage message instead of an IndexError traceback.
    sys.exit('Usage: python TPM2JSON.py parameter_for_net.txt')
param_file = sys.argv[1] # a single parameter file
glb_param_dict = make_global_param_dict(param_file)
expr_dict = read_matrix_data(glb_param_dict['EXPRESSION_MATRIX'])
if not check_json_file(expr_dict, JSON_DIR):
    make_json_file(expr_dict, JSON_DIR)