summaryrefslogtreecommitdiff
path: root/Code/assign_tissue.py
diff options
context:
space:
mode:
authorHui Lan <lanhui@zjnu.edu.cn>2019-12-04 19:03:19 +0800
committerHui Lan <lanhui@zjnu.edu.cn>2019-12-04 19:03:19 +0800
commit97fdefab064f63642fa3ece05b807d29b459df31 (patch)
treea058530023224f3e35b1783996f3530c80c04bc5 /Code/assign_tissue.py
brain: add python and R code to local repository.
Diffstat (limited to 'Code/assign_tissue.py')
-rw-r--r--Code/assign_tissue.py139
1 files changed, 139 insertions, 0 deletions
diff --git a/Code/assign_tissue.py b/Code/assign_tissue.py
new file mode 100644
index 0000000..782ab78
--- /dev/null
+++ b/Code/assign_tissue.py
@@ -0,0 +1,139 @@
+# Usage: python assign_tissue.py > ../Data/temp/experiment.and.tissue.1.txt
+# Set TPM_FILE = , d = and d2 =
+#
+# Purpose: for each RNA-seq column in the TPM_FILE, get its tissue information
+# Excute this command to avoid encoding error: export PYTHONIOENCODING=UTF-8
+#
+# 2 June 2017, slcu, hui
+# Last modified 19 June 2017, slcu, hui
+
+import os, sys, json
+import urllib2
+
+def make_tissue_dict(fname):
+ f = open(fname)
+ d = json.load(f)
+ f.close()
+ return d
+
+def get_experiment_id(fname):
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ result = []
+ for line in lines[1:]:
+ line = line.strip()
+ lst = line.split('\t')
+ if len(lst) >= 2:
+ result.append(lst[1])
+ return result
+
+def get_sra_id(x):
+ if 'RR' in x:
+ index1 = x.find('RR')
+ index2 = x.find('X')
+ if index2 == -1:
+ index2 = len(x)
+ return x[index1-1:index2]
+ return x
+
+def make_sample_dict(fname):
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ d = {}
+ for line in lines[1:]:
+ line = line.strip()
+ lst = line.split('\t')
+ if len(lst) >= 2:
+ runid = lst[0]
+ sample = lst[1]
+ d[runid] = (sample, ';'.join(lst[2:]))
+ return d
+
+
+def stringfy_json(d):
+ s = ''
+
+ if "organismPart" in d["characteristics"]:
+ s = d["characteristics"]["organismPart"][0]["text"]
+ else:
+ s = 'part_unknown'
+
+ if "accession" in d:
+ s += '\t' + d["accession"]
+ else:
+ s += '\t' + '.'
+
+ if "name" in d:
+ s += '\t' + d["name"]
+ else:
+ s += '\t' + '.'
+
+ if "synonym" in d["characteristics"]:
+ s += '\t' + d["characteristics"]["synonym"][0]["text"]
+ else:
+ s += '\t' + '.'
+
+ if "ecotype" in d["characteristics"]:
+ s += '\t' + d["characteristics"]["ecotype"][0]["text"]
+ else:
+ s += '\t' + '.'
+
+ if "developmentStage" in d["characteristics"]:
+ s += '\t' + d["characteristics"]["developmentStage"][0]["text"]
+ else:
+ s += '\t' + '.'
+
+ if "description" in d:
+ s += '\t' + d["description"] if d["description"] != None else '.'
+ else:
+ s += '\t' + '.'
+
+ return s
+
+def make_information(s, info_dir):
+ lst = s.split('...')
+ sample_id = lst[0]
+ filename = '%s/%s.json' % (info_dir, sample_id)
+ #url = 'https://www.ebi.ac.uk/biosamples/api/samples/search/findByAccession?accession=%s' % (sample_id)
+ if not os.path.exists(filename):
+ cmd = 'curl -s -H Content-Type:application/json https://www.ebi.ac.uk/biosamples/api/samples/search/findByAccession\?accession\=%s > %s' % (sample_id, filename)
+ os.system(cmd)
+
+ f = open(filename)
+ d = json.load(f)
+ f.close()
+ #d = json.load(urllib2.urlopen(url))
+ if len(d["_embedded"]["samples"]) > 0:
+ return stringfy_json(d["_embedded"]["samples"][0])
+ else:
+ return '.\t.'
+
+# main
+BIOSAMPLE_INFO_DIR = '/home/hui/network/v03/Data/information/BioSample' # put downloaded BioSample json files here
+if not os.path.isdir(BIOSAMPLE_INFO_DIR):
+ os.makedirs(BIOSAMPLE_INFO_DIR)
+
+# get first row in the TPM file
+TPM_FILE = '../Data/history/expr/TPM.txt'
+cmd = 'head -1 %s | perl -ne \'@words = split /\t/; $count = 1; for $x (@words) {print $count, "\t", $x, "\n"; $count++}\' > ../Data/temp/a.txt' % (TPM_FILE)
+os.system(cmd)
+
+lst = get_experiment_id('../Data/temp/a.txt')
+d = make_tissue_dict('../Data/information/rnaseq_info_database.json') # excuting parse_xml.py > rnaseq_info_database.txt, mainly for getting tissue names (inferred by word frequency in the description)
+d2 = make_sample_dict('../Data/information/rnaseq_info_database.txt') # parse_xml.py > rnaseq_info_database.txt, mainly for getting the BioSample id for each run
+head = ''
+for x in lst:
+ k = get_sra_id(x) # get rid of prefix R00... and suffix ..XX
+ s = x
+ if k in d:
+ s += '\t' + d[k]['tissue'] + '\t' + make_information(d2[k][0].decode('utf8'), BIOSAMPLE_INFO_DIR) + '\t' + d2[k][1].decode('utf8')
+ elif x.startswith('R0001') and ('146' in x or '147' in x): # inhouse data
+ s += '\t' + 'seedling\t' + '\t'.join(8*['.'])
+ elif x.startswith('R0002'): # pcubas (Spain) data
+ s += '\t' + 'meristem\t' + '\t'.join(8*['.'])
+ else:
+ s += '\t' + '\t'.join(9*['.']) # k is gene_id
+ print(s)
+ head += s.replace('\t', '_') + '\t'