# Usage: python update_rnaseq_info_json.py # Provide two files old_json and tissue_file # # Purpose: update the tissue field in rnaseq_info_database.json. Make # Data/information/experiment.and.tissue.txt in which rnaseq samples # with unknown tissues are predicted using knn_classify.R with K=1. # # 2 June 2017, slcu, hui # Last modified 19 June 2017, slcu, hui import json, os, sys def get_sra_id(x): if 'RR' in x: index1 = x.find('RR') index2 = x.find('X') if index2 == -1: index2 = len(x) return x[index1-1:index2] return x def make_tissue_dict(fname): f = open(fname) lines = f.readlines() d = {} for line in lines[1:]: line = line.strip() lst = line.split('\t') x = lst[0] y = get_sra_id(x) d[y] = lst[4] f.close() return d def update_tissue_dict_and_tissue_file(d, fname, fname_pred): f = open(fname_pred) # predicted file, columns are sample.name and predicted.tissue lines = f.readlines() f.close() for line in lines[1:]: line = line.strip() lst = line.split('\t') if line != '' and len(lst) >= 2: y = get_sra_id(lst[0]) d[y] = lst[1] f = open(fname) lines = f.readlines() head_line = lines[0].strip() f.close() file_lines = [head_line] for line in lines[1:]: line = line.strip() lst = line.split('\t') if line != '' and len(lst) >= 5: k = get_sra_id(lst[0]) if lst[4] == 'unknown' and k in d: lst[4] = d[k] file_lines.append('\t'.join(lst)) outfile = '../Data/information/experiment.and.tissue.txt' # so that outfile dose not contain unknown f = open(outfile, 'w') f.write('\n'.join(file_lines) + '\n') f.close() return d # main RSCRIPT_FILE = 'knn_classify.R' old_json = '../Data/information/rnaseq_info_database.json' # generated by parse_xml.py tissue_file = '../Data/information/experiment.and.tissue.2.txt' # generated by refine_tissue.py tissue_dict = make_tissue_dict(tissue_file) if os.path.exists(RSCRIPT_FILE): cmd = 'Rscript %s' % (RSCRIPT_FILE) # generate ../Data/temp/predicted.label.txt os.system(cmd) tissue_dict = update_tissue_dict_and_tissue_file(tissue_dict, tissue_file, '../Data/temp/predicted.label.txt') with open(old_json) as json_data: json_dict = json.load(json_data) for k in json_dict: if k in tissue_dict: json_dict[k]['tissue'] = tissue_dict[k] cmd = 'cp %s ../Data/information/rnaseq_info_database.json.old' % (old_json) os.system(cmd) fname = old_json with open(fname, 'w') as f: json.dump(json_dict, f, indent=4) print('Check updated %s.' % (old_json))