diff options
author | Lan Hui <lanhui@zjnu.edu.cn> | 2025-04-22 16:25:28 +0800 |
---|---|---|
committer | Lan Hui <lanhui@zjnu.edu.cn> | 2025-04-22 16:25:28 +0800 |
commit | e3b6fc97ecc857e488525b23938c18d0c7cc9683 (patch) | |
tree | f9324da25f642681408dfc3d20cb09a27af64f0d | |
parent | 173bdebe94fa0b4890e5bd80148efa033bb3bab3 (diff) |
Include the XML tree structure for each ENA data source (ena_*.xml)
-rw-r--r-- | Code/parse_ena_xml.py | 123 |
1 file changed, 123 insertions, 0 deletions
diff --git a/Code/parse_ena_xml.py b/Code/parse_ena_xml.py index 8cb7946..eb1f610 100644 --- a/Code/parse_ena_xml.py +++ b/Code/parse_ena_xml.py @@ -37,6 +37,29 @@ MAX_DESCRIPTION_LENGTH = 6000 # max number to characters to keep in json file def parse_run(fname): + ''' + Each record has the following format: + (Use Data/temp/xmltreeview.py to get the following output) + + 0: RUN_SET [-] + 1: RUN [accession, alias, broker_name, center_name, run_center, run_date] + 2: IDENTIFIERS [-] + 3: PRIMARY_ID [-] + 3: SUBMITTER_ID [namespace] + 2: TITLE [-] + 2: EXPERIMENT_REF [accession, refcenter, refname] + 2: RUN_LINKS [-] + 3: RUN_LINK [-] + 4: XREF_LINK [-] + 5: DB [-] + 5: ID [-] + 2: RUN_ATTRIBUTES [-] + 3: RUN_ATTRIBUTE [-] + 4: TAG [-] + 4: VALUE [-] + + ''' + d = {} root = xml.etree.ElementTree.parse(fname).getroot() @@ -71,6 +94,37 @@ def parse_run(fname): def parse_study(fname): + ''' + Each record has the following format: + (Use Data/temp/xmltreeview.py to get the following output) + + 0: PROJECT_SET [-] + 1: PROJECT [accession, alias, center_name, broker_name] + 2: IDENTIFIERS [-] + 3: PRIMARY_ID [-] + 3: SUBMITTER_ID [namespace] + 2: NAME [-] + 2: TITLE [-] + 2: DESCRIPTION [-] + 2: SUBMISSION_PROJECT [-] + 3: SEQUENCING_PROJECT [-] + 3: ORGANISM [-] + 4: TAXON_ID [-] + 4: SCIENTIFIC_NAME [-] + 2: RELATED_PROJECTS [-] + 3: RELATED_PROJECT [-] + 4: PARENT_PROJECT [accession] + 2: PROJECT_LINKS [-] + 3: PROJECT_LINK [-] + 4: XREF_LINK [-] + 5: DB [-] + 5: ID [-] + 2: PROJECT_ATTRIBUTES [-] + 3: PROJECT_ATTRIBUTE [-] + 4: TAG [-] + 4: VALUE [-] + ''' + d = {} root = xml.etree.ElementTree.parse(fname).getroot() @@ -100,6 +154,32 @@ def parse_study(fname): def parse_sample(fname): + ''' + Each record has the following format: + (Use Data/temp/xmltreeview.py to get the following output) + + 0: SAMPLE_SET [-] + 1: SAMPLE [accession, alias, center_name, broker_name] + 2: IDENTIFIERS [-] + 3: PRIMARY_ID [-] + 3: SECONDARY_ID [-] + 3: EXTERNAL_ID [namespace] + 2: TITLE 
[-] + 2: SAMPLE_NAME [-] + 3: TAXON_ID [-] + 3: SCIENTIFIC_NAME [-] + 3: COMMON_NAME [-] + 2: SAMPLE_LINKS [-] + 3: SAMPLE_LINK [-] + 4: XREF_LINK [-] + 5: DB [-] + 5: ID [-] + 2: SAMPLE_ATTRIBUTES [-] + 3: SAMPLE_ATTRIBUTE [-] + 4: TAG [-] + 4: VALUE [-] + ''' + d = {} root = xml.etree.ElementTree.parse(fname).getroot() @@ -139,6 +219,49 @@ def parse_sample(fname): def parse_experiment(fname): + ''' + Each record has the following format: + (Use Data/temp/xmltreeview.py to get the following output) + + 0: EXPERIMENT_SET [-] + 1: EXPERIMENT [accession, alias, broker_name, center_name] + 2: IDENTIFIERS [-] + 3: PRIMARY_ID [-] + 3: SUBMITTER_ID [namespace] + 2: TITLE [-] + 2: STUDY_REF [accession] + 3: IDENTIFIERS [-] + 4: PRIMARY_ID [-] + 4: SECONDARY_ID [-] + 2: DESIGN [-] + 3: DESIGN_DESCRIPTION [-] + 3: SAMPLE_DESCRIPTOR [accession] + 4: IDENTIFIERS [-] + 5: PRIMARY_ID [-] + 5: EXTERNAL_ID [namespace] + 3: LIBRARY_DESCRIPTOR [-] + 4: LIBRARY_NAME [-] + 4: LIBRARY_STRATEGY [-] + 4: LIBRARY_SOURCE [-] + 4: LIBRARY_SELECTION [-] + 4: LIBRARY_LAYOUT [-] + 5: PAIRED [-] + 4: LIBRARY_CONSTRUCTION_PROTOCOL [-] + 2: PLATFORM [-] + 3: ILLUMINA [-] + 4: INSTRUMENT_MODEL [-] + 2: PROCESSING [-] + 2: EXPERIMENT_LINKS [-] + 3: EXPERIMENT_LINK [-] + 4: XREF_LINK [-] + 5: DB [-] + 5: ID [-] + 2: EXPERIMENT_ATTRIBUTES [-] + 3: EXPERIMENT_ATTRIBUTE [-] + 4: TAG [-] + 4: VALUE [-] + ''' + d = {} root = xml.etree.ElementTree.parse(fname).getroot() |