# Provenance: added in commit 97fdefab064f63642fa3ece05b807d29b459df31
# (Hui Lan, 4 Dec 2019), "brain: add python and R code to local repository."
#
# Usage: python update_rnaseq_info_json.py
# Provide two files old_json and tissue_file
#
# Purpose: update the tissue field in rnaseq_info_database.json. Make
# Data/information/experiment.and.tissue.txt in which rnaseq samples
# with unknown tissues are predicted using knn_classify.R with K=1.
#
# 2 June 2017, slcu, hui
# Last modified 19 June 2017, slcu, hui

import json
import os
import shutil
import subprocess
import sys

RSCRIPT_FILE = 'knn_classify.R'
old_json = '../Data/information/rnaseq_info_database.json'  # generated by parse_xml.py
tissue_file = '../Data/information/experiment.and.tissue.2.txt'  # generated by refine_tissue.py
PREDICTED_FILE = '../Data/temp/predicted.label.txt'  # generated by RSCRIPT_FILE
TISSUE_OUTFILE = '../Data/information/experiment.and.tissue.txt'


def get_sra_id(x):
    """Return the run id embedded in sample name *x*.

    The id starts one character before the first 'RR' (so the leading
    letter, e.g. the 'S' of 'SRR', is kept) and ends just before the first
    'X' found after that 'RR', or at the end of the string when there is
    none.  A name that does not contain 'RR' is returned unchanged.
    """
    index1 = x.find('RR')
    if index1 == -1:
        return x
    # Clamp at 0: a name starting with 'RR' must not yield a negative
    # (wrap-around) slice start.
    start = max(index1 - 1, 0)
    # Only an 'X' located after the 'RR' can terminate the id; an earlier
    # 'X' would otherwise produce an empty or truncated result.
    index2 = x.find('X', index1 + 2)
    if index2 == -1:
        index2 = len(x)
    return x[start:index2]


def make_tissue_dict(fname):
    """Build a dict mapping run id -> tissue from the tab-separated *fname*.

    The first line is treated as a header and skipped.  For every other
    line the first column is converted with get_sra_id() and the fifth
    column is stored as its tissue.  Lines with fewer than five columns
    (including blank lines) are ignored.
    """
    d = {}
    with open(fname) as f:
        lines = f.readlines()
    for line in lines[1:]:
        lst = line.strip().split('\t')
        if len(lst) < 5:
            continue  # blank or malformed line: no tissue column to read
        d[get_sra_id(lst[0])] = lst[4]
    return d


def update_tissue_dict_and_tissue_file(d, fname, fname_pred, outfile=TISSUE_OUTFILE):
    """Merge predicted tissues into *d* and write a refreshed tissue file.

    d          -- dict of run id -> tissue; updated in place and returned.
    fname      -- tab-separated tissue file (header + rows, tissue in the
                  fifth column).
    fname_pred -- tab-separated predicted file whose columns are
                  sample.name and predicted.tissue (header skipped).
    outfile    -- path of the file to write; defaults to
                  ../Data/information/experiment.and.tissue.txt.

    Every prediction overrides the entry of the same run id in *d*.  The
    rows of *fname* are then copied to *outfile*, replacing a tissue of
    'unknown' with the value held in *d* when one exists.  Rows with fewer
    than five columns are not copied.
    """
    with open(fname_pred) as f:
        pred_lines = f.readlines()
    for line in pred_lines[1:]:
        line = line.strip()
        lst = line.split('\t')
        if line != '' and len(lst) >= 2:
            d[get_sra_id(lst[0])] = lst[1]

    with open(fname) as f:
        lines = f.readlines()
    file_lines = [lines[0].strip()]  # keep the header line
    for line in lines[1:]:
        line = line.strip()
        lst = line.split('\t')
        if line != '' and len(lst) >= 5:
            k = get_sra_id(lst[0])
            if lst[4] == 'unknown' and k in d:
                lst[4] = d[k]
            file_lines.append('\t'.join(lst))

    # Written separately so that outfile does not contain 'unknown' for
    # samples that have a known or predicted tissue.
    with open(outfile, 'w') as f:
        f.write('\n'.join(file_lines) + '\n')
    return d


def _run_rscript(script):
    """Run *script* with Rscript; warn on stderr if it could not run or failed."""
    try:
        status = subprocess.call(['Rscript', script])
    except OSError as err:
        sys.stderr.write('Warning: could not run Rscript %s: %s\n' % (script, err))
        return
    if status != 0:
        sys.stderr.write('Warning: Rscript %s exited with status %d.\n' % (script, status))


def main():
    """Predict unknown tissues (if the R script is present) and update old_json."""
    tissue_dict = make_tissue_dict(tissue_file)
    if os.path.exists(RSCRIPT_FILE):
        _run_rscript(RSCRIPT_FILE)  # generate ../Data/temp/predicted.label.txt
        tissue_dict = update_tissue_dict_and_tissue_file(
            tissue_dict, tissue_file, PREDICTED_FILE)

    with open(old_json) as json_data:
        json_dict = json.load(json_data)
    for k in json_dict:
        if k in tissue_dict:
            json_dict[k]['tissue'] = tissue_dict[k]

    # Keep a backup of the previous database before overwriting it.
    shutil.copy(old_json, old_json + '.old')
    with open(old_json, 'w') as f:
        json.dump(json_dict, f, indent=4)

    print('Check updated %s.' % (old_json))


if __name__ == '__main__':
    main()