brain: add python and R code to local repository.

author: Hui Lan <lanhui@zjnu.edu.cn> 2019-12-04 19:03:19 +0800
committer: Hui Lan <lanhui@zjnu.edu.cn> 2019-12-04 19:03:19 +0800
commit: 97fdefab064f63642fa3ece05b807d29b459df31 (patch)
tree: a058530023224f3e35b1783996f3530c80c04bc5 /Code/update_rnaseq_info_json.py
1 files changed, 89 insertions, 0 deletions
diff --git a/Code/update_rnaseq_info_json.py b/Code/update_rnaseq_info_json.py
new file mode 100644
index 0000000..4d6b654
--- /dev/null
+++ b/Code/update_rnaseq_info_json.py
@@ -0,0 +1,89 @@
+# Usage: python update_rnaseq_info_json.py
+#        Provide two files old_json and tissue_file
+#
+# Purpose: update the tissue field in rnaseq_info_database.json.  Make
+# Data/information/experiment.and.tissue.txt in which rnaseq samples
+# with unknown tissues are predicted using knn_classify.R with K=1.
+#
+# 2 June 2017, slcu, hui
+# Last modified 19 June 2017, slcu, hui
+
+import json, os, sys
+
+def get_sra_id(x):
+    if 'RR' in x:
+        index1 = x.find('RR')
+        index2 = x.find('X')
+        if index2 == -1:
+            index2 = len(x)
+        return x[index1-1:index2]
+    return x
+
+def make_tissue_dict(fname):
+    f = open(fname)
+    lines = f.readlines()
+    d = {}
+    for line in lines[1:]:
+        line = line.strip()
+        lst = line.split('\t')
+        x = lst[0]
+        y = get_sra_id(x)
+        d[y] = lst[4]
+    f.close()
+    return d
+
+def update_tissue_dict_and_tissue_file(d, fname, fname_pred):
+    
+    f = open(fname_pred) # predicted file, columns are sample.name and predicted.tissue
+    lines = f.readlines()
+    f.close()
+    for line in lines[1:]:
+        line = line.strip()
+        lst = line.split('\t')        
+        if line != '' and len(lst) >= 2:
+            y = get_sra_id(lst[0])
+            d[y] = lst[1]
+
+    f = open(fname)
+    lines = f.readlines()
+    head_line = lines[0].strip()    
+    f.close()
+    file_lines = [head_line]
+    for line in lines[1:]:
+        line = line.strip()
+        lst = line.split('\t')        
+        if line != '' and len(lst) >= 5:
+            k = get_sra_id(lst[0])
+            if lst[4] == 'unknown' and k in d:
+                lst[4] = d[k]
+        file_lines.append('\t'.join(lst))
+    outfile = '../Data/information/experiment.and.tissue.txt' # so that outfile dose not contain unknown
+    f = open(outfile, 'w')
+    f.write('\n'.join(file_lines) + '\n')
+    f.close()
+    return d
+
+
+# main
+RSCRIPT_FILE = 'knn_classify.R'
+old_json = '../Data/information/rnaseq_info_database.json' # generated by parse_xml.py
+tissue_file = '../Data/information/experiment.and.tissue.2.txt'   # generated by refine_tissue.py
+tissue_dict = make_tissue_dict(tissue_file)
+if os.path.exists(RSCRIPT_FILE):
+    cmd = 'Rscript %s' % (RSCRIPT_FILE) # generate ../Data/temp/predicted.label.txt
+    os.system(cmd)
+    tissue_dict = update_tissue_dict_and_tissue_file(tissue_dict, tissue_file, '../Data/temp/predicted.label.txt')
+
+with open(old_json) as json_data:
+        json_dict = json.load(json_data)
+        for k in json_dict:
+            if k in tissue_dict:
+                json_dict[k]['tissue'] = tissue_dict[k]
+
+cmd = 'cp %s ../Data/information/rnaseq_info_database.json.old' % (old_json)
+os.system(cmd)
+fname = old_json
+with open(fname, 'w') as f:
+    json.dump(json_dict, f, indent=4)
+
+print('Check updated %s.' % (old_json))
author	Hui Lan <lanhui@zjnu.edu.cn>	2019-12-04 19:03:19 +0800
committer	Hui Lan <lanhui@zjnu.edu.cn>	2019-12-04 19:03:19 +0800
commit	97fdefab064f63642fa3ece05b807d29b459df31 (patch)
tree	a058530023224f3e35b1783996f3530c80c04bc5 /Code/update_rnaseq_info_json.py