# Usage: python assign_tissue.py > ../Data/temp/experiment.and.tissue.1.txt # Set TPM_FILE = , d = and d2 = # # Purpose: for each RNA-seq column in the TPM_FILE, get its tissue information # Excute this command to avoid encoding error: export PYTHONIOENCODING=UTF-8 # # 2 June 2017, slcu, hui # Last modified 19 June 2017, slcu, hui # Last modified 5 November 2022, hui import os, sys, json def make_tissue_dict(fname): f = open(fname) d = json.load(f) f.close() return d def get_experiment_id(fname): f = open(fname) lines = f.readlines() f.close() result = [] for line in lines[1:]: line = line.strip() lst = line.split('\t') if len(lst) >= 2: result.append(lst[1]) return result def get_sra_id(x): if 'RR' in x: index1 = x.find('RR') index2 = x.find('X') if index2 == -1: index2 = len(x) return x[index1-1:index2] return x def make_sample_dict(fname): f = open(fname) lines = f.readlines() f.close() d = {} for line in lines[1:]: # skip the head line line = line.strip() lst = line.split('\t') if len(lst) >= 2: runid = lst[0] sample = lst[1] d[runid] = (sample, ';'.join(lst[2:])) return d def stringfy_json(d): s = '' if "organismPart" in d["characteristics"]: s = d["characteristics"]["organismPart"][0]["text"] else: s = 'part_unknown' if "accession" in d: s += '\t' + d["accession"] else: s += '\t' + '.' if "name" in d: s += '\t' + d["name"] else: s += '\t' + '.' if "synonym" in d["characteristics"]: s += '\t' + d["characteristics"]["synonym"][0]["text"] else: s += '\t' + '.' if "ecotype" in d["characteristics"]: s += '\t' + d["characteristics"]["ecotype"][0]["text"] else: s += '\t' + '.' if "developmentStage" in d["characteristics"]: s += '\t' + d["characteristics"]["developmentStage"][0]["text"] else: s += '\t' + '.' if "description" in d: s += '\t' + d["description"] if d["description"] != None else '.' else: s += '\t' + '.' return s def make_information(s, info_dir): lst = s.split('...') sample_id = lst[0] # looks like SAMD00012930 filename = '%s/%s.json' % (info_dir, sample_id) if not os.path.exists(filename): #cmd = 'curl -s -H Content-Type:application/json https://www.ebi.ac.uk/biosamples/api/samples/search/findByAccession\?accession\=%s > %s' % (sample_id, filename) cmd = 'curl -s -H Content-Type:application/json https://www.ebi.ac.uk/biosamples/samples/%s.json > %s' % (sample_id, filename) os.system(cmd) f = open(filename) d = json.load(f) f.close() if 'tissue' in d['characteristics']: return d['characteristics']['tissue'][0]['text'].lower() + '\t' + sample_id else: return '.' + '\t' + sample_id # main BIOSAMPLE_INFO_DIR = '../Data/information/BioSample' # put downloaded BioSample json files here if not os.path.isdir(BIOSAMPLE_INFO_DIR): os.makedirs(BIOSAMPLE_INFO_DIR) # get first row in the TPM file TPM_FILE = '../Data/history/expr/TPM.txt' cmd = 'head -1 %s | perl -ne \'@words = split /\t/; $count = 1; for $x (@words) {print $count, "\t", $x, "\n"; $count++}\' > ../Data/temp/a.txt' % (TPM_FILE) os.system(cmd) lst = get_experiment_id('../Data/temp/a.txt') d = make_tissue_dict('../Data/information/rnaseq_info_database.json') # excuting parse_xml.py > rnaseq_info_database.txt, mainly for getting tissue names (inferred by word frequency in the description) d2 = make_sample_dict('../Data/information/rnaseq_info_database.txt.temp') # parse_xml.py > rnaseq_info_database.txt, mainly for getting the BioSample id for each run head = '' # run.id inferred.tissue biosample.tissue biosample.id with open('../Data/temp/experiment.and.tissue.1.txt', 'w', encoding='utf8') as f: for x in lst: s = x # run id, such as R0DRR016125XXX k = get_sra_id(x) # get rid of prefix R00... and suffix ..XX if k in d: s += '\t' + d[k]['tissue'] + '\t' + make_information(d2[k][0], BIOSAMPLE_INFO_DIR) elif x.startswith('R0001') and ('146' in x or '147' in x): # inhouse data s += '\t' + 'seedling\t' + '\t'.join(8*['.']) elif x.startswith('R0002'): # pcubas (Spain) data s += '\t' + 'meristem\t' + '\t'.join(8*['.']) else: s += '\t' + '\t'.join(9*['.']) # k is gene_id f.write(s + '\n') head += s.replace('\t', '_') + '\t'