summaryrefslogtreecommitdiff
path: root/Code/assign_tissue.py
diff options
context:
space:
mode:
Diffstat (limited to 'Code/assign_tissue.py')
-rw-r--r--Code/assign_tissue.py50
1 files changed, 26 insertions, 24 deletions
diff --git a/Code/assign_tissue.py b/Code/assign_tissue.py
index 782ab78..4a81e18 100644
--- a/Code/assign_tissue.py
+++ b/Code/assign_tissue.py
@@ -6,9 +6,9 @@
#
# 2 June 2017, slcu, hui
# Last modified 19 June 2017, slcu, hui
+# Last modified 5 November 2022, hui
import os, sys, json
-import urllib2
def make_tissue_dict(fname):
f = open(fname)
@@ -42,7 +42,7 @@ def make_sample_dict(fname):
lines = f.readlines()
f.close()
d = {}
- for line in lines[1:]:
+ for line in lines[1:]: # skip the head line
line = line.strip()
lst = line.split('\t')
if len(lst) >= 2:
@@ -94,24 +94,23 @@ def stringfy_json(d):
def make_information(s, info_dir):
lst = s.split('...')
- sample_id = lst[0]
+ sample_id = lst[0] # looks like SAMD00012930
filename = '%s/%s.json' % (info_dir, sample_id)
- #url = 'https://www.ebi.ac.uk/biosamples/api/samples/search/findByAccession?accession=%s' % (sample_id)
if not os.path.exists(filename):
- cmd = 'curl -s -H Content-Type:application/json https://www.ebi.ac.uk/biosamples/api/samples/search/findByAccession\?accession\=%s > %s' % (sample_id, filename)
+ #cmd = 'curl -s -H Content-Type:application/json https://www.ebi.ac.uk/biosamples/api/samples/search/findByAccession\?accession\=%s > %s' % (sample_id, filename)
+ cmd = 'curl -s -H Content-Type:application/json https://www.ebi.ac.uk/biosamples/samples/%s.json > %s' % (sample_id, filename)
os.system(cmd)
f = open(filename)
d = json.load(f)
f.close()
- #d = json.load(urllib2.urlopen(url))
- if len(d["_embedded"]["samples"]) > 0:
- return stringfy_json(d["_embedded"]["samples"][0])
+ if 'tissue' in d['characteristics']:
+ return d['characteristics']['tissue'][0]['text'].lower() + '\t' + sample_id
else:
- return '.\t.'
+ return '.' + '\t' + sample_id
# main
-BIOSAMPLE_INFO_DIR = '/home/hui/network/v03/Data/information/BioSample' # put downloaded BioSample json files here
+BIOSAMPLE_INFO_DIR = '../Data/information/BioSample' # put downloaded BioSample json files here
if not os.path.isdir(BIOSAMPLE_INFO_DIR):
os.makedirs(BIOSAMPLE_INFO_DIR)
@@ -122,18 +121,21 @@ os.system(cmd)
lst = get_experiment_id('../Data/temp/a.txt')
d = make_tissue_dict('../Data/information/rnaseq_info_database.json') # excuting parse_xml.py > rnaseq_info_database.txt, mainly for getting tissue names (inferred by word frequency in the description)
-d2 = make_sample_dict('../Data/information/rnaseq_info_database.txt') # parse_xml.py > rnaseq_info_database.txt, mainly for getting the BioSample id for each run
+d2 = make_sample_dict('../Data/information/rnaseq_info_database.txt.temp') # parse_xml.py > rnaseq_info_database.txt, mainly for getting the BioSample id for each run
head = ''
-for x in lst:
- k = get_sra_id(x) # get rid of prefix R00... and suffix ..XX
- s = x
- if k in d:
- s += '\t' + d[k]['tissue'] + '\t' + make_information(d2[k][0].decode('utf8'), BIOSAMPLE_INFO_DIR) + '\t' + d2[k][1].decode('utf8')
- elif x.startswith('R0001') and ('146' in x or '147' in x): # inhouse data
- s += '\t' + 'seedling\t' + '\t'.join(8*['.'])
- elif x.startswith('R0002'): # pcubas (Spain) data
- s += '\t' + 'meristem\t' + '\t'.join(8*['.'])
- else:
- s += '\t' + '\t'.join(9*['.']) # k is gene_id
- print(s)
- head += s.replace('\t', '_') + '\t'
+# run.id inferred.tissue biosample.tissue biosample.id
+with open('../Data/temp/experiment.and.tissue.1.txt', 'w', encoding='utf8') as f:
+ for x in lst:
+ s = x # run id, such as R0DRR016125XXX
+ k = get_sra_id(x) # get rid of prefix R00... and suffix ..XX
+ if k in d:
+ s += '\t' + d[k]['tissue'] + '\t' + make_information(d2[k][0], BIOSAMPLE_INFO_DIR)
+ elif x.startswith('R0001') and ('146' in x or '147' in x): # inhouse data
+ s += '\t' + 'seedling\t' + '\t'.join(8*['.'])
+ elif x.startswith('R0002'): # pcubas (Spain) data
+ s += '\t' + 'meristem\t' + '\t'.join(8*['.'])
+ else:
+ s += '\t' + '\t'.join(9*['.']) # k is gene_id
+ f.write(s + '\n')
+ head += s.replace('\t', '_') + '\t'
+