diff options
author | Hui Lan <lanhui@zjnu.edu.cn> | 2019-12-04 19:03:19 +0800 |
---|---|---|
committer | Hui Lan <lanhui@zjnu.edu.cn> | 2019-12-04 19:03:19 +0800 |
commit | 97fdefab064f63642fa3ece05b807d29b459df31 (patch) | |
tree | a058530023224f3e35b1783996f3530c80c04bc5 /Code/assign_tissue.py |
brain: add python and R code to local repository.
Diffstat (limited to 'Code/assign_tissue.py')
-rw-r--r-- | Code/assign_tissue.py | 139 |
1 files changed, 139 insertions, 0 deletions
diff --git a/Code/assign_tissue.py b/Code/assign_tissue.py new file mode 100644 index 0000000..782ab78 --- /dev/null +++ b/Code/assign_tissue.py @@ -0,0 +1,139 @@ +# Usage: python assign_tissue.py > ../Data/temp/experiment.and.tissue.1.txt +# Set TPM_FILE = , d = and d2 = +# +# Purpose: for each RNA-seq column in the TPM_FILE, get its tissue information +# Excute this command to avoid encoding error: export PYTHONIOENCODING=UTF-8 +# +# 2 June 2017, slcu, hui +# Last modified 19 June 2017, slcu, hui + +import os, sys, json +import urllib2 + +def make_tissue_dict(fname): + f = open(fname) + d = json.load(f) + f.close() + return d + +def get_experiment_id(fname): + f = open(fname) + lines = f.readlines() + f.close() + result = [] + for line in lines[1:]: + line = line.strip() + lst = line.split('\t') + if len(lst) >= 2: + result.append(lst[1]) + return result + +def get_sra_id(x): + if 'RR' in x: + index1 = x.find('RR') + index2 = x.find('X') + if index2 == -1: + index2 = len(x) + return x[index1-1:index2] + return x + +def make_sample_dict(fname): + f = open(fname) + lines = f.readlines() + f.close() + d = {} + for line in lines[1:]: + line = line.strip() + lst = line.split('\t') + if len(lst) >= 2: + runid = lst[0] + sample = lst[1] + d[runid] = (sample, ';'.join(lst[2:])) + return d + + +def stringfy_json(d): + s = '' + + if "organismPart" in d["characteristics"]: + s = d["characteristics"]["organismPart"][0]["text"] + else: + s = 'part_unknown' + + if "accession" in d: + s += '\t' + d["accession"] + else: + s += '\t' + '.' + + if "name" in d: + s += '\t' + d["name"] + else: + s += '\t' + '.' + + if "synonym" in d["characteristics"]: + s += '\t' + d["characteristics"]["synonym"][0]["text"] + else: + s += '\t' + '.' + + if "ecotype" in d["characteristics"]: + s += '\t' + d["characteristics"]["ecotype"][0]["text"] + else: + s += '\t' + '.' + + if "developmentStage" in d["characteristics"]: + s += '\t' + d["characteristics"]["developmentStage"][0]["text"] + else: + s += '\t' + '.' + + if "description" in d: + s += '\t' + d["description"] if d["description"] != None else '.' + else: + s += '\t' + '.' + + return s + +def make_information(s, info_dir): + lst = s.split('...') + sample_id = lst[0] + filename = '%s/%s.json' % (info_dir, sample_id) + #url = 'https://www.ebi.ac.uk/biosamples/api/samples/search/findByAccession?accession=%s' % (sample_id) + if not os.path.exists(filename): + cmd = 'curl -s -H Content-Type:application/json https://www.ebi.ac.uk/biosamples/api/samples/search/findByAccession\?accession\=%s > %s' % (sample_id, filename) + os.system(cmd) + + f = open(filename) + d = json.load(f) + f.close() + #d = json.load(urllib2.urlopen(url)) + if len(d["_embedded"]["samples"]) > 0: + return stringfy_json(d["_embedded"]["samples"][0]) + else: + return '.\t.' + +# main +BIOSAMPLE_INFO_DIR = '/home/hui/network/v03/Data/information/BioSample' # put downloaded BioSample json files here +if not os.path.isdir(BIOSAMPLE_INFO_DIR): + os.makedirs(BIOSAMPLE_INFO_DIR) + +# get first row in the TPM file +TPM_FILE = '../Data/history/expr/TPM.txt' +cmd = 'head -1 %s | perl -ne \'@words = split /\t/; $count = 1; for $x (@words) {print $count, "\t", $x, "\n"; $count++}\' > ../Data/temp/a.txt' % (TPM_FILE) +os.system(cmd) + +lst = get_experiment_id('../Data/temp/a.txt') +d = make_tissue_dict('../Data/information/rnaseq_info_database.json') # excuting parse_xml.py > rnaseq_info_database.txt, mainly for getting tissue names (inferred by word frequency in the description) +d2 = make_sample_dict('../Data/information/rnaseq_info_database.txt') # parse_xml.py > rnaseq_info_database.txt, mainly for getting the BioSample id for each run +head = '' +for x in lst: + k = get_sra_id(x) # get rid of prefix R00... and suffix ..XX + s = x + if k in d: + s += '\t' + d[k]['tissue'] + '\t' + make_information(d2[k][0].decode('utf8'), BIOSAMPLE_INFO_DIR) + '\t' + d2[k][1].decode('utf8') + elif x.startswith('R0001') and ('146' in x or '147' in x): # inhouse data + s += '\t' + 'seedling\t' + '\t'.join(8*['.']) + elif x.startswith('R0002'): # pcubas (Spain) data + s += '\t' + 'meristem\t' + '\t'.join(8*['.']) + else: + s += '\t' + '\t'.join(9*['.']) # k is gene_id + print(s) + head += s.replace('\t', '_') + '\t' |