From 7ef975cba6caf33dab08f3b57b2bb7f744739887 Mon Sep 17 00:00:00 2001 From: Hui Lan Date: Wed, 10 Feb 2021 14:02:23 +0800 Subject: parse_ena_xml.py: adapt the script to use the latest meta data. --- Code/parse_ena_xml.py | 203 +++++++++++++++++++++++--------------------------- 1 file changed, 95 insertions(+), 108 deletions(-) diff --git a/Code/parse_ena_xml.py b/Code/parse_ena_xml.py index b9d6905..4c54bc5 100644 --- a/Code/parse_ena_xml.py +++ b/Code/parse_ena_xml.py @@ -24,32 +24,13 @@ # 30 May 2017, slcu, hui # 01 Jun 2017, slcu, hui [added a column sample_id] # 19 Jun 2017, slcu, hui [added SraRunTable_Ath_Tax3702.txt in d_run2. Search d_run2 for how to get SraRunTable_Ath_Tax3702.txt.] - +# 10 Feb 2021, zjnu, hui [download latest run info files: information/ena_3702_read_run.xml, ena_3702_study.xml, ena_3702_sample.xml, information/ena_3702_read_experiment.xml;do not use SraRunTable_Ath_Tax3702.txt and d_run2 anymore.] import os, json, re, operator import xml.etree.ElementTree import sys -MAX_DESCRIPTION_LENGTH = 600 # max number to characters to keep in json file +MAX_DESCRIPTION_LENGTH = 6000 # max number to characters to keep in json file -def parse_SraRunTable(fname): - d = {} - f = open(fname) - lines = f.readlines() - f.close() - for line in lines: - line = line.strip() - if not line.startswith('#') and not line.startswith('Assay_Type_s') and line.lower().startswith('rna-seq'): - lst = line.split('\t') - acc = lst[17] - if not acc in d: - d[acc] = {} - d[acc]['experiment_id'] = lst[6] if lst[6] != '' else '.' - d[acc]['sample_id'] = (lst[4] + '...' + lst[18] + ' ' + lst[20]) if lst[4] != '' else '.' - d[acc]['study_id'] = lst[19] if lst[19] != '' else '.' - d[acc]['study_id_PRJ'] = lst[3] if lst[3] != '' else '.' - d[acc]['alias'] = lst[11] if lst[11] != '' else '.' - d[acc]['title'] = lst[20] if lst[20] != '' else '.' - return d def parse_run(fname): d = {} @@ -72,7 +53,6 @@ def parse_run(fname): d[acc]['study_id'] = '.' for i in c.findall('./RUN_LINKS/RUN_LINK/XREF_LINK/ID'): s = i.text - #print(s) if 'RP' in s: # run project d[acc]['study_id'] = s break @@ -208,7 +188,18 @@ def parse_experiment(fname): d2['description'] = 'None' if desc != None and desc.text != None: d2['description'] = desc.text + + strategy = c.find('./DESIGN/LIBRARY_DESCRIPTOR/LIBRARY_STRATEGY') + d2['library_strategy'] = 'None' # we look for RNA-Seq + if strategy != None and strategy.text != None: + d2['library_strategy'] = strategy.text + + source = c.find('./DESIGN/LIBRARY_DESCRIPTOR/LIBRARY_SOURCE') + d2['library_source'] = 'None!' + if source != None and source.text != None: + d2['library_source'] = source.text + run_id = '' for i in c.findall('./EXPERIMENT_LINKS/EXPERIMENT_LINK/XREF_LINK/ID'): s = i.text @@ -267,98 +258,94 @@ def get_tissue(s): return result.rstrip(';') -## main -# ENA xml meta files do not differentiate between different types of Seq, but are organised by RUN, STUDY, EXPERIMENT. So each -# of the following function is for each type of xml file. -d_sample = parse_sample('../Data/information/ena_sample.xml') # SAMPLE. We can download RUN, STUDY, EXPERIMENT using download_ena_metadata.py, but not for SAMPLE (weired). So we need manually download ena_sample.xml. Enter http://www.ebi.ac.uk/ena/data/search?query=arabidopsis%20thaliana, click Sample (31,042) in the left panel of the displayed page, then click XML link in the right panel. The XML link is very small, so take time to find it or search for XML. -d_run = parse_run('../Data/information/ena_rnaseq_read_run.xml') # RUN -d_run2 = parse_SraRunTable('../Data/information/SraRunTable_Ath_Tax3702.txt') # Go to https://www.ncbi.nlm.nih.gov/sra. Type (arabidopsis thaliana) AND "Arabidopsis thaliana"[orgn:__txid3702]. Click "Send results to Run selector". Click "RunInfo Table". Save SraRunTable.txt -d_study = parse_study('../Data/information/ena_rnaseq_read_study.xml') # STUDY -d_experiment = parse_experiment('../Data/information/ena_rnaseq_read_experiment.xml') # EXPERIMENT - - -cmd = 'export PYTHONIOENCODING=UTF-8' # since xml files contains non-ascii characters, use this command to avoid encoding error during redirection -os.system(cmd) - -print('%s' % ('\t'.join(['run_id', 'sample_id', 'experiment_id', 'study_id', 'study_id_PRJ', 'title', 'alias', 'description']))) # description comes from three sources, STUDY, SAMPLE and EXPERIMENT -d_run_keys = d_run.keys() -d_run_keys.extend(d_run2.keys()) -d_run_keys = list(set(d_run_keys)) -for k in sorted(d_run_keys): - lst = [] - lst.append(k) - if k in d_run: - if k in d_sample: - if d_sample[k]['external_id'] != '.': - lst.append(d_sample[k]['external_id'] + '...' + d_sample[k]['tissue']) - else: - lst.append(d_sample[k]['primary_id'] + '...' + d_sample[k]['tissue']) - else: - lst.append('.') - lst.append( d_run[k]['experiment_id']) - lst.append( d_run[k]['study_id'] ) - if k in d_study: - lst.append( d_study[k]['primary_id'] ) - else: - lst.append( '.' ) - lst.append( d_run[k]['title'] ) - lst.append( d_run[k]['alias'] ) - s = '' # description string - if k in d_study: - s += '

[Study title] ' + d_study[k]['title'] + '

[Study description] ' + d_study[k]['description'] #
is used for breaking lines in html +## main +if __name__ == '__main__': + + # ENA xml meta files do not differentiate between different types of Seq, but are organised by RUN, STUDY, SAMPLE, EXPERIMENT. So each + # of the following function is call for each type of xml file. The input files were downloaded from https://www.ebi.ac.uk/ena/browser/view/Taxon:3702 + d_run = parse_run('../Data/information/ena_3702_read_run.xml') # RUN + d_sample = parse_sample('../Data/information/ena_3702_sample.xml') # SAMPLE + d_study = parse_study('../Data/information/ena_3702_read_study.xml') # STUDY + d_experiment = parse_experiment('../Data/information/ena_3702_read_experiment.xml') # EXPERIMENT, including library strategy (RNA-Seq, WSG, etc) and library source (TRANSCRIPTIOMIC, GENOMIC, etc) + + cmd = 'export PYTHONIOENCODING=UTF-8' # since xml files contains non-ascii characters, use this command to avoid encoding error during redirection + os.system(cmd) + + print('%s' % ('\t'.join(['run_id', 'sample_id', 'experiment_id', 'study_id', 'study_id_PRJ', 'title', 'alias', 'description', 'library_strategy', 'library_source']))) # description comes from three sources, STUDY, SAMPLE and EXPERIMENT - if k in d_sample: - s += '

[Sample title] ' + d_sample[k]['title'] + '

[Sample description] ' + d_sample[k]['description'] + d_run_keys = d_run.keys() + d_run_keys = list(set(d_run_keys)) + + # Collect information for each run ID + for k in sorted(d_run_keys): + lst = [] + lst.append(k) + if k in d_run: + if k in d_sample: + if d_sample[k]['external_id'] != '.': + lst.append(d_sample[k]['external_id'] + '...' + d_sample[k]['tissue']) + else: + lst.append(d_sample[k]['primary_id'] + '...' + d_sample[k]['tissue']) + else: + lst.append('.') - if k in d_experiment: - s += '

[Experiment title] ' + d_experiment[k]['title'] + '

[Experiment description] ' + d_experiment[k]['description'] + lst.append( d_run[k]['experiment_id'] ) + lst.append( d_run[k]['study_id'] ) - if s == '': - s = '.' + # for column study_id_PRJ + if k in d_study: + lst.append( d_study[k]['primary_id'] ) + else: + lst.append( '.' ) + lst.append( d_run[k]['title'] ) + lst.append( d_run[k]['alias'] ) + + s = '' # description string + + if k in d_study: + s += '

[Study title] ' + d_study[k]['title'] + '

[Study description] ' + d_study[k]['description'] #
is used for breaking lines in html + if k in d_sample: + s += '

[Sample title] ' + d_sample[k]['title'] + '

[Sample description] ' + d_sample[k]['description'] + + if k in d_experiment: + s += '

[Experiment title] ' + d_experiment[k]['title'] + '

[Experiment description] ' + d_experiment[k]['description'] + + if s == '': + s = '.' + + lst.append(s) + + lst.append(d_experiment[k]['library_strategy']) - lst.append(s) - elif k in d_run2: - lst.append(d_run2[k]['sample_id']) - lst.append(d_run2[k]['experiment_id']) - lst.append(d_run2[k]['study_id']) - lst.append(d_run2[k]['study_id_PRJ']) - lst.append(d_run2[k]['title']) - lst.append(d_run2[k]['alias']) - lst.append('.') + lst.append(d_experiment[k]['library_source']) + + print('%s' % ('\t'.join(lst))) - print('%s' % ('\t'.join(lst))) + + # Make a json file as well. this file is used to display rna-seq information in scatterplots. + json_dict = {} + for k in sorted(d_run_keys): + if k in d_run: + s = 'Title: ' + d_run[k]['title'] + '. Alias: ' + d_run[k]['alias'] + '. More info:' + if k in d_study: + s += ' ' + d_study[k]['title'] + ' ' + d_study[k]['description'] + if k in d_sample: + s += ' ' + d_sample[k]['title'] + ' ' + d_sample[k]['description'] + if k in d_experiment: + s += ' ' + d_experiment[k]['title'] + ' ' + d_experiment[k]['description'] + + s = s.strip() + d = {} + d['tissue'] = get_tissue(s) + d['library_strategy'] = d_experiment[k]['library_strategy'] + d['library_source'] = d_experiment[k]['library_source'] + d['detail'] = s[0:min(MAX_DESCRIPTION_LENGTH, len(s))] + ' ...' + + json_dict[k] = d -# make a json file as well. this file is used to display rna-seq information in scatterplots. -json_dict = {} -for k in sorted(d_run_keys): - if k in d_run: - s = 'Title: ' + d_run[k]['title'] + '. Alias: ' + d_run[k]['alias'] + '. More info:' - if k in d_study: - s += ' ' + d_study[k]['title'] + ' ' + d_study[k]['description'] - if k in d_sample: - s += ' ' + d_sample[k]['title'] + ' ' + d_sample[k]['description'] - if k in d_experiment: - s += ' ' + d_experiment[k]['title'] + ' ' + d_experiment[k]['description'] - - s = s.strip() - d = {} - d['tissue'] = get_tissue(s) - d['detail'] = s[0:min(MAX_DESCRIPTION_LENGTH, len(s))] + ' ...' - - elif k in d_run2: - s = d_run2[k]['title'] + ' ' + d_run2[k]['alias'] - s = s.strip() - d = {} - d['tissue'] = get_tissue(s) - d['detail'] = s[0:min(MAX_DESCRIPTION_LENGTH, len(s))] + ' ...' - - json_dict[k] = d - -fname = '../Data/information/rnaseq_info_database.json' -with open(fname, 'w') as f: - json.dump(json_dict, f, indent=4) - -#sys.stderr.write('Check %s. Use this file to display RNA-seq information in the scatterplots. Copy it to Data/information and rename it to rnaseq_info_database.json.\n' % (fname)) + fname = '../Data/information/rnaseq_info_database.json.temp' + with open(fname, 'w') as f: + json.dump(json_dict, f, indent=4) -- cgit v1.2.1