# Usage: python parse_ena_xml.py > rnaseq_info_database.txt # # Search in this script for 'd_run', 'd_sample', 'd_experiment' and # 'd_study', and set their input files. The input files are generated # by download_ena_metadata.py (except for d_sample). It also # generates a json file called info_database.json, for displaying # experimental information in the scatterplot. If the input files are # for RNA-seq data, rename info_database.json to # rnaseq_info_database.json and move it to Data/information. Also move # rnaseq_info_database.txt to Data/information. They are used by # html_network.py. # # Purpose: Get description for RNA-seq data, one for each SRA Run ID. # Make rnaseq_info_database.txt and rnaseq_info_database.json. Each # line in rnaseq_info_database.txt contains information for a run id. # # NOTE: you might encounter UnicideEncodeError when running the # program. To avoid that, first type this command: # export PYTHONIOENCODING=UTF-8. # # 22 Feb 2017, slcu, hui # 12 Apr 2017, slcu, hui # 20 Apr 2017, slcu, hui # 30 May 2017, slcu, hui # 01 Jun 2017, slcu, hui [added a column sample_id] # 19 Jun 2017, slcu, hui [added SraRunTable_Ath_Tax3702.txt in d_run2. Search d_run2 for how to get SraRunTable_Ath_Tax3702.txt.] # 10 Feb 2021, zjnu, hui [download latest run info files: information/ena_3702_read_run.xml, ena_3702_study.xml, ena_3702_sample.xml, information/ena_3702_read_experiment.xml;do not use SraRunTable_Ath_Tax3702.txt and d_run2 anymore.] import os, json, re, operator import xml.etree.ElementTree import sys MAX_DESCRIPTION_LENGTH = 6000 # max number to characters to keep in json file def parse_run(fname): d = {} root = xml.etree.ElementTree.parse(fname).getroot() for c in root.findall('RUN'): acc = c.get('accession') d[acc] = {} alias = c.get('alias') d[acc]['alias'] = alias experiment = c.find('EXPERIMENT_REF').get('accession') d[acc]['experiment_id'] = experiment title = c.find('TITLE').text d[acc]['title'] = title d[acc]['study_id'] = '.' for i in c.findall('./RUN_LINKS/RUN_LINK/XREF_LINK/ID'): s = i.text if 'RP' in s: # run project d[acc]['study_id'] = s break d[acc]['sample_id'] = '.' for i in c.findall('./RUN_LINKS/RUN_LINK/XREF_LINK/ID'): s = i.text if 'RS' in s: # run project d[acc]['sample_id'] = s break return d def parse_study(fname): d = {} root = xml.etree.ElementTree.parse(fname).getroot() for c in root.findall('PROJECT'): d2 = {} acc = c.find('./IDENTIFIERS/SECONDARY_ID') if acc != None: d2['secondary_id'] = acc.text else: d2['secondary_id'] = '.' d2['primary_id'] = c.get('accession') desc = c.find('DESCRIPTION') d2['description'] = 'None' if desc != None: d2['description'] = desc.text title = c.find('TITLE') d2['title'] = 'None' if title != None: d2['title'] = title.text run_id = '' for i in c.findall('./PROJECT_LINKS/PROJECT_LINK/XREF_LINK/ID'): s = i.text if 'RR' in s: run_id = s; break lst = run_id.split(',') for x in lst: lst2 = x.split('-') if len(lst2) == 1 and lst2[0] != '': k = lst2[0] d[k] = d2 # k is run id, such as SRR, ERR or DRR elif len(lst2) == 2: ss = lst2[0] ee = lst2[1] first_three_letters = ss[0:3] sz = len(ss) - 3 ss_t = int(ss[3:]) ee_t = int(ee[3:]) for j in range(ss_t, ee_t+1, 1): k = first_three_letters + str(j).zfill(sz) d[k] = d2 return d def parse_sample(fname): d = {} root = xml.etree.ElementTree.parse(fname).getroot() for c in root.findall('SAMPLE'): d2 = {} acc = c.find('./IDENTIFIERS/EXTERNAL_ID') if acc != None: d2['external_id'] = acc.text else: d2['external_id'] = '.' d2['primary_id'] = c.get('accession') desc = c.find('DESCRIPTION') d2['description'] = 'None' if desc != None and desc.text != None: d2['description'] = desc.text title = c.find('TITLE') d2['title'] = 'None' if title != None and title.text != None: d2['title'] = title.text tissue_type = '' for i in c.findall('./SAMPLE_ATTRIBUTES/SAMPLE_ATTRIBUTE/VALUE'): if i != None and i.text != None: tissue_type += i.text + ' ' d2['tissue'] = tissue_type.strip() run_id = '' for i in c.findall('./SAMPLE_LINKS/SAMPLE_LINK/XREF_LINK/ID'): s = i.text if 'RR' in s: run_id = s; break lst = run_id.split(',') for x in lst: lst2 = x.split('-') # e.g., SRR520490-SRR520491 if len(lst2) == 1 and lst2[0] != '': k = lst2[0] d[k] = d2 # k is run id, such as SRR, ERR or DRR elif len(lst2) == 2: ss = lst2[0] ee = lst2[1] first_three_letters = ss[0:3] sz = len(ss) - 3 ss_t = int(ss[3:]) ee_t = int(ee[3:]) for j in range(ss_t, ee_t+1, 1): k = first_three_letters + str(j).zfill(sz) d[k] = d2 return d def parse_experiment(fname): d = {} root = xml.etree.ElementTree.parse(fname).getroot() for c in root.findall('EXPERIMENT'): d2 = {} d2['primary_id'] = c.get('accession') title = c.find('TITLE') d2['title'] = 'None' if title != None and title.text != None: d2['title'] = title.text desc = c.find('./DESIGN/DESIGN_DESCRIPTION') d2['description'] = 'None' if desc != None and desc.text != None: d2['description'] = desc.text strategy = c.find('./DESIGN/LIBRARY_DESCRIPTOR/LIBRARY_STRATEGY') d2['library_strategy'] = 'None' # we look for RNA-Seq if strategy != None and strategy.text != None: d2['library_strategy'] = strategy.text source = c.find('./DESIGN/LIBRARY_DESCRIPTOR/LIBRARY_SOURCE') d2['library_source'] = 'None!' if source != None and source.text != None: d2['library_source'] = source.text run_id = '' for i in c.findall('./EXPERIMENT_LINKS/EXPERIMENT_LINK/XREF_LINK/ID'): s = i.text if 'RR' in s: run_id = s; break lst = run_id.split(',') for x in lst: lst2 = x.split('-') # e.g., SRR520490-SRR520491 if len(lst2) == 1 and lst2[0] != '': k = lst2[0] d[k] = d2 # k is run id, such as SRR, ERR or DRR elif len(lst2) == 2: ss = lst2[0] ee = lst2[1] first_three_letters = ss[0:3] sz = len(ss) - 3 ss_t = int(ss[3:]) ee_t = int(ee[3:]) for j in range(ss_t, ee_t+1, 1): k = first_three_letters + str(j).zfill(sz) d[k] = d2 return d def get_singular_form(w): d = {'seedlings':'seedling', 'roots':'root', 'leaves':'leaf', 'flowers':'flower', 'floral':'flower', 'shoots':'shoot', 'apices':'apex', 'stems':'stem', 'seeds':'seed', 'petals':'petals', 'sepals':'sepal', 'embryos':'embryo', 'embryonic':'embryo', 'cotyledons':'cotyledon', 'hairs':'hair', 'meristems':'meristem', 'epidermal':'epidermis', 'apical':'apex', 'buds':'bud', 'vacuoles':'vacuole', 'vacuolar':'vacuole', 'tips':'tip', 'pollens':'pollen', 'hypocotyls':'hypocotyl', 'tubes':'tube', 'stomatal':'stomata', 'ovule':'ovules', 'pistils':'pistil', 'anthers':'anther', 'carpels':'carpel', 'pedicle':'pedicel', 'vascular':'vasculum'} if w in d: return d[w] return w def get_tissue(s): ''' Extract tissue name from s. s may contain several tissue names, return them ordered by frequency. ''' lst = ['seedling', 'seedlings', 'root', 'roots', 'leaves', 'leaf', 'flower', 'flowers', 'floral', 'shoot', 'shoots', 'apex', 'apices', 'stamen', 'stem', 'stems', 'seed', 'seeds', 'petal', 'petals', 'sepal', 'sepals', 'embryo', 'embryos', 'embryonic', 'cotyledon', 'cotyledons', 'xylem', 'hair', 'hairs', 'phloem', 'pericycle', 'primordia', 'columella', 'cortex', 'meristem', 'meristems', 'cambium', 'epidermis', 'epidermal', 'phloem', 'mesophyll', 'apical', 'lateral', 'intercalary', 'parenchyma', 'collenchyma', 'sclerenchyma', 'bud', 'buds', 'endosperm', 'colletotrichum', 'stele', 'vacuoles', 'vacuole', 'vacuolar', 'tip', 'tips', 'pollen', 'hypocotyl', 'hypocotyls', 'tube', 'tubes', 'basal', 'stomatal', 'stomata', 'surface', 'progeny', 'ovules', 'carpel', 'carpels', 'gynoecium', 'pistil', 'pistils', 'anthers', 'anther', 'endodermis', 'dicotyledonous', 'hyphae', 'adabaxial', 'axial', 'cauline', 'rosette', 'pedicle', 'pedicel', 'inflorescence', 'petiole', 'lamina', 'vascular', 'bundle', 'sheath'] # possible tissue names, lower case. refer to /home/hui/network/test/rnaseq.word.count.txt for distinct words in rna seq. rnaseq.word.count.txt is generated by /home/hui/network/test/count_word.py # build a count dictionary, where key is a word d = {} s = s.lower() wlst = re.sub("[^\w]", " ", s).split() # a list of words in s. http://stackoverflow.com/questions/6181763/converting-a-string-to-a-list-of-words for w in wlst: if w in lst: w2 = get_singular_form(w) if not w2 in d: d[w2] = 1 else: d[w2] += 1 if len(d) == 0: return 'unknown' tlst = sorted(d.items(), key=operator.itemgetter(1), reverse=True) result = '' for t in tlst: result += '%s(%d);' % (t[0], t[1]) return result.rstrip(';') ## main if __name__ == '__main__': # ENA xml meta files do not differentiate between different types of Seq, but are organised by RUN, STUDY, SAMPLE, EXPERIMENT. So each # of the following function is call for each type of xml file. The input files were downloaded from https://www.ebi.ac.uk/ena/browser/view/Taxon:3702 d_run = parse_run('../Data/information/ena_3702_read_run.xml') # RUN d_sample = parse_sample('../Data/information/ena_3702_sample.xml') # SAMPLE d_study = parse_study('../Data/information/ena_3702_read_study.xml') # STUDY d_experiment = parse_experiment('../Data/information/ena_3702_read_experiment.xml') # EXPERIMENT, including library strategy (RNA-Seq, WSG, etc) and library source (TRANSCRIPTIOMIC, GENOMIC, etc) cmd = 'export PYTHONIOENCODING=UTF-8' # since xml files contains non-ascii characters, use this command to avoid encoding error during redirection os.system(cmd) print('%s' % ('\t'.join(['run_id', 'sample_id', 'experiment_id', 'study_id', 'study_id_PRJ', 'title', 'alias', 'description', 'library_strategy', 'library_source']))) # description comes from three sources, STUDY, SAMPLE and EXPERIMENT d_run_keys = d_run.keys() d_run_keys = list(set(d_run_keys)) # Collect information for each run ID for k in sorted(d_run_keys): lst = [] lst.append(k) if k in d_run: if k in d_sample: if d_sample[k]['external_id'] != '.': lst.append(d_sample[k]['external_id'] + '...' + d_sample[k]['tissue']) else: lst.append(d_sample[k]['primary_id'] + '...' + d_sample[k]['tissue']) else: lst.append('.') lst.append( d_run[k]['experiment_id'] ) lst.append( d_run[k]['study_id'] ) # for column study_id_PRJ if k in d_study: lst.append( d_study[k]['primary_id'] ) else: lst.append( '.' ) lst.append( d_run[k]['title'] ) lst.append( d_run[k]['alias'] ) s = '' # description string if k in d_study: s += '

[Study title] ' + d_study[k]['title'] + '

[Study description] ' + d_study[k]['description'] #
is used for breaking lines in html if k in d_sample: s += '

[Sample title] ' + d_sample[k]['title'] + '

[Sample description] ' + d_sample[k]['description'] if k in d_experiment: s += '

[Experiment title] ' + d_experiment[k]['title'] + '

[Experiment description] ' + d_experiment[k]['description'] if s == '': s = '.' lst.append(s) lst.append(d_experiment[k]['library_strategy']) lst.append(d_experiment[k]['library_source']) print('%s' % ('\t'.join(lst))) # Make a json file as well. this file is used to display rna-seq information in scatterplots. json_dict = {} for k in sorted(d_run_keys): if k in d_run: s = 'Title: ' + d_run[k]['title'] + '. Alias: ' + d_run[k]['alias'] + '. More info:' if k in d_study: s += ' ' + d_study[k]['title'] + ' ' + d_study[k]['description'] if k in d_sample: s += ' ' + d_sample[k]['title'] + ' ' + d_sample[k]['description'] if k in d_experiment: s += ' ' + d_experiment[k]['title'] + ' ' + d_experiment[k]['description'] s = s.strip() d = {} d['tissue'] = get_tissue(s) d['library_strategy'] = d_experiment[k]['library_strategy'] d['library_source'] = d_experiment[k]['library_source'] d['detail'] = s[0:min(MAX_DESCRIPTION_LENGTH, len(s))] + ' ...' json_dict[k] = d fname = '../Data/information/rnaseq_info_database.json.temp' with open(fname, 'w') as f: json.dump(json_dict, f, indent=4)