diff options
Diffstat (limited to 'Code/assign_tissue.py')
-rw-r--r-- | Code/assign_tissue.py | 50 |
1 files changed, 26 insertions, 24 deletions
diff --git a/Code/assign_tissue.py b/Code/assign_tissue.py index 782ab78..4a81e18 100644 --- a/Code/assign_tissue.py +++ b/Code/assign_tissue.py @@ -6,9 +6,9 @@ # # 2 June 2017, slcu, hui # Last modified 19 June 2017, slcu, hui +# Last modified 5 November 2022, hui import os, sys, json -import urllib2 def make_tissue_dict(fname): f = open(fname) @@ -42,7 +42,7 @@ def make_sample_dict(fname): lines = f.readlines() f.close() d = {} - for line in lines[1:]: + for line in lines[1:]: # skip the head line line = line.strip() lst = line.split('\t') if len(lst) >= 2: @@ -94,24 +94,23 @@ def stringfy_json(d): def make_information(s, info_dir): lst = s.split('...') - sample_id = lst[0] + sample_id = lst[0] # looks like SAMD00012930 filename = '%s/%s.json' % (info_dir, sample_id) - #url = 'https://www.ebi.ac.uk/biosamples/api/samples/search/findByAccession?accession=%s' % (sample_id) if not os.path.exists(filename): - cmd = 'curl -s -H Content-Type:application/json https://www.ebi.ac.uk/biosamples/api/samples/search/findByAccession\?accession\=%s > %s' % (sample_id, filename) + #cmd = 'curl -s -H Content-Type:application/json https://www.ebi.ac.uk/biosamples/api/samples/search/findByAccession\?accession\=%s > %s' % (sample_id, filename) + cmd = 'curl -s -H Content-Type:application/json https://www.ebi.ac.uk/biosamples/samples/%s.json > %s' % (sample_id, filename) os.system(cmd) f = open(filename) d = json.load(f) f.close() - #d = json.load(urllib2.urlopen(url)) - if len(d["_embedded"]["samples"]) > 0: - return stringfy_json(d["_embedded"]["samples"][0]) + if 'tissue' in d['characteristics']: + return d['characteristics']['tissue'][0]['text'].lower() + '\t' + sample_id else: - return '.\t.' + return '.' + '\t' + sample_id # main -BIOSAMPLE_INFO_DIR = '/home/hui/network/v03/Data/information/BioSample' # put downloaded BioSample json files here +BIOSAMPLE_INFO_DIR = '../Data/information/BioSample' # put downloaded BioSample json files here if not os.path.isdir(BIOSAMPLE_INFO_DIR): os.makedirs(BIOSAMPLE_INFO_DIR) @@ -122,18 +121,21 @@ os.system(cmd) lst = get_experiment_id('../Data/temp/a.txt') d = make_tissue_dict('../Data/information/rnaseq_info_database.json') # excuting parse_xml.py > rnaseq_info_database.txt, mainly for getting tissue names (inferred by word frequency in the description) -d2 = make_sample_dict('../Data/information/rnaseq_info_database.txt') # parse_xml.py > rnaseq_info_database.txt, mainly for getting the BioSample id for each run +d2 = make_sample_dict('../Data/information/rnaseq_info_database.txt.temp') # parse_xml.py > rnaseq_info_database.txt, mainly for getting the BioSample id for each run head = '' -for x in lst: - k = get_sra_id(x) # get rid of prefix R00... and suffix ..XX - s = x - if k in d: - s += '\t' + d[k]['tissue'] + '\t' + make_information(d2[k][0].decode('utf8'), BIOSAMPLE_INFO_DIR) + '\t' + d2[k][1].decode('utf8') - elif x.startswith('R0001') and ('146' in x or '147' in x): # inhouse data - s += '\t' + 'seedling\t' + '\t'.join(8*['.']) - elif x.startswith('R0002'): # pcubas (Spain) data - s += '\t' + 'meristem\t' + '\t'.join(8*['.']) - else: - s += '\t' + '\t'.join(9*['.']) # k is gene_id - print(s) - head += s.replace('\t', '_') + '\t' +# run.id inferred.tissue biosample.tissue biosample.id +with open('../Data/temp/experiment.and.tissue.1.txt', 'w', encoding='utf8') as f: + for x in lst: + s = x # run id, such as R0DRR016125XXX + k = get_sra_id(x) # get rid of prefix R00... and suffix ..XX + if k in d: + s += '\t' + d[k]['tissue'] + '\t' + make_information(d2[k][0], BIOSAMPLE_INFO_DIR) + elif x.startswith('R0001') and ('146' in x or '147' in x): # inhouse data + s += '\t' + 'seedling\t' + '\t'.join(8*['.']) + elif x.startswith('R0002'): # pcubas (Spain) data + s += '\t' + 'meristem\t' + '\t'.join(8*['.']) + else: + s += '\t' + '\t'.join(9*['.']) # k is gene_id + f.write(s + '\n') + head += s.replace('\t', '_') + '\t' + |