summaryrefslogtreecommitdiff
path: root/Code/update_rnaseq_info_json.py
blob: 4d6b65438ac60e4a94db2fa389f5d528fb24f94b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# Usage: python update_rnaseq_info_json.py
#        Provide two files old_json and tissue_file
#
# Purpose: update the tissue field in rnaseq_info_database.json.  Make
# Data/information/experiment.and.tissue.txt in which rnaseq samples
# with unknown tissues are predicted using knn_classify.R with K=1.
#
# 2 June 2017, slcu, hui
# Last modified 19 June 2017, slcu, hui

import json, os, sys

def get_sra_id(x):
    if 'RR' in x:
        index1 = x.find('RR')
        index2 = x.find('X')
        if index2 == -1:
            index2 = len(x)
        return x[index1-1:index2]
    return x

def make_tissue_dict(fname):
    f = open(fname)
    lines = f.readlines()
    d = {}
    for line in lines[1:]:
        line = line.strip()
        lst = line.split('\t')
        x = lst[0]
        y = get_sra_id(x)
        d[y] = lst[4]
    f.close()
    return d

def update_tissue_dict_and_tissue_file(d, fname, fname_pred):
    
    f = open(fname_pred) # predicted file, columns are sample.name and predicted.tissue
    lines = f.readlines()
    f.close()
    for line in lines[1:]:
        line = line.strip()
        lst = line.split('\t')        
        if line != '' and len(lst) >= 2:
            y = get_sra_id(lst[0])
            d[y] = lst[1]

    f = open(fname)
    lines = f.readlines()
    head_line = lines[0].strip()    
    f.close()
    file_lines = [head_line]
    for line in lines[1:]:
        line = line.strip()
        lst = line.split('\t')        
        if line != '' and len(lst) >= 5:
            k = get_sra_id(lst[0])
            if lst[4] == 'unknown' and k in d:
                lst[4] = d[k]
        file_lines.append('\t'.join(lst))
    outfile = '../Data/information/experiment.and.tissue.txt' # so that outfile dose not contain unknown
    f = open(outfile, 'w')
    f.write('\n'.join(file_lines) + '\n')
    f.close()
    return d


# main
RSCRIPT_FILE = 'knn_classify.R'
old_json = '../Data/information/rnaseq_info_database.json' # generated by parse_xml.py
tissue_file = '../Data/information/experiment.and.tissue.2.txt'   # generated by refine_tissue.py
tissue_dict = make_tissue_dict(tissue_file)
if os.path.exists(RSCRIPT_FILE):
    cmd = 'Rscript %s' % (RSCRIPT_FILE) # generate ../Data/temp/predicted.label.txt
    os.system(cmd)
    tissue_dict = update_tissue_dict_and_tissue_file(tissue_dict, tissue_file, '../Data/temp/predicted.label.txt')

with open(old_json) as json_data:
        json_dict = json.load(json_data)
        for k in json_dict:
            if k in tissue_dict:
                json_dict[k]['tissue'] = tissue_dict[k]

cmd = 'cp %s ../Data/information/rnaseq_info_database.json.old' % (old_json)
os.system(cmd)
fname = old_json
with open(fname, 'w') as f:
    json.dump(json_dict, f, indent=4)

print('Check updated %s.' % (old_json))