summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Lan Hui <lanhui@zjnu.edu.cn> 2025-04-22 16:25:28 +0800
committer: Lan Hui <lanhui@zjnu.edu.cn> 2025-04-22 16:25:28 +0800
commit: e3b6fc97ecc857e488525b23938c18d0c7cc9683 (patch)
tree: f9324da25f642681408dfc3d20cb09a27af64f0d
parent: 173bdebe94fa0b4890e5bd80148efa033bb3bab3 (diff)
Include XML tree structure for each ena data source, ena_*.xml
-rw-r--r-- Code/parse_ena_xml.py 123
1 files changed, 123 insertions, 0 deletions
diff --git a/Code/parse_ena_xml.py b/Code/parse_ena_xml.py
index 8cb7946..eb1f610 100644
--- a/Code/parse_ena_xml.py
+++ b/Code/parse_ena_xml.py
@@ -37,6 +37,29 @@ MAX_DESCRIPTION_LENGTH = 6000 # max number of characters to keep in json file
def parse_run(fname):
+ '''
+ Each record has the following XML tree structure, shown as `depth: TAG [attribute names]`
+ (`[-]` presumably means no attributes; listing generated with Data/temp/xmltreeview.py):
+
+ 0: RUN_SET [-]
+ 1: RUN [accession, alias, broker_name, center_name, run_center, run_date]
+ 2: IDENTIFIERS [-]
+ 3: PRIMARY_ID [-]
+ 3: SUBMITTER_ID [namespace]
+ 2: TITLE [-]
+ 2: EXPERIMENT_REF [accession, refcenter, refname]
+ 2: RUN_LINKS [-]
+ 3: RUN_LINK [-]
+ 4: XREF_LINK [-]
+ 5: DB [-]
+ 5: ID [-]
+ 2: RUN_ATTRIBUTES [-]
+ 3: RUN_ATTRIBUTE [-]
+ 4: TAG [-]
+ 4: VALUE [-]
+
+ '''
+
d = {}
root = xml.etree.ElementTree.parse(fname).getroot()
@@ -71,6 +94,37 @@ def parse_run(fname):
def parse_study(fname):
+ '''
+ Each record has the following XML tree structure, shown as `depth: TAG [attribute names]`
+ (`[-]` presumably means no attributes; listing generated with Data/temp/xmltreeview.py):
+
+ 0: PROJECT_SET [-]
+ 1: PROJECT [accession, alias, center_name, broker_name]
+ 2: IDENTIFIERS [-]
+ 3: PRIMARY_ID [-]
+ 3: SUBMITTER_ID [namespace]
+ 2: NAME [-]
+ 2: TITLE [-]
+ 2: DESCRIPTION [-]
+ 2: SUBMISSION_PROJECT [-]
+ 3: SEQUENCING_PROJECT [-]
+ 3: ORGANISM [-]
+ 4: TAXON_ID [-]
+ 4: SCIENTIFIC_NAME [-]
+ 2: RELATED_PROJECTS [-]
+ 3: RELATED_PROJECT [-]
+ 4: PARENT_PROJECT [accession]
+ 2: PROJECT_LINKS [-]
+ 3: PROJECT_LINK [-]
+ 4: XREF_LINK [-]
+ 5: DB [-]
+ 5: ID [-]
+ 2: PROJECT_ATTRIBUTES [-]
+ 3: PROJECT_ATTRIBUTE [-]
+ 4: TAG [-]
+ 4: VALUE [-]
+ '''
+
d = {}
root = xml.etree.ElementTree.parse(fname).getroot()
@@ -100,6 +154,32 @@ def parse_study(fname):
def parse_sample(fname):
+ '''
+ Each record has the following XML tree structure, shown as `depth: TAG [attribute names]`
+ (`[-]` presumably means no attributes; listing generated with Data/temp/xmltreeview.py):
+
+ 0: SAMPLE_SET [-]
+ 1: SAMPLE [accession, alias, center_name, broker_name]
+ 2: IDENTIFIERS [-]
+ 3: PRIMARY_ID [-]
+ 3: SECONDARY_ID [-]
+ 3: EXTERNAL_ID [namespace]
+ 2: TITLE [-]
+ 2: SAMPLE_NAME [-]
+ 3: TAXON_ID [-]
+ 3: SCIENTIFIC_NAME [-]
+ 3: COMMON_NAME [-]
+ 2: SAMPLE_LINKS [-]
+ 3: SAMPLE_LINK [-]
+ 4: XREF_LINK [-]
+ 5: DB [-]
+ 5: ID [-]
+ 2: SAMPLE_ATTRIBUTES [-]
+ 3: SAMPLE_ATTRIBUTE [-]
+ 4: TAG [-]
+ 4: VALUE [-]
+ '''
+
d = {}
root = xml.etree.ElementTree.parse(fname).getroot()
@@ -139,6 +219,49 @@ def parse_sample(fname):
def parse_experiment(fname):
+ '''
+ Each record has the following XML tree structure, shown as `depth: TAG [attribute names]`
+ (`[-]` presumably means no attributes; listing generated with Data/temp/xmltreeview.py):
+
+ 0: EXPERIMENT_SET [-]
+ 1: EXPERIMENT [accession, alias, broker_name, center_name]
+ 2: IDENTIFIERS [-]
+ 3: PRIMARY_ID [-]
+ 3: SUBMITTER_ID [namespace]
+ 2: TITLE [-]
+ 2: STUDY_REF [accession]
+ 3: IDENTIFIERS [-]
+ 4: PRIMARY_ID [-]
+ 4: SECONDARY_ID [-]
+ 2: DESIGN [-]
+ 3: DESIGN_DESCRIPTION [-]
+ 3: SAMPLE_DESCRIPTOR [accession]
+ 4: IDENTIFIERS [-]
+ 5: PRIMARY_ID [-]
+ 5: EXTERNAL_ID [namespace]
+ 3: LIBRARY_DESCRIPTOR [-]
+ 4: LIBRARY_NAME [-]
+ 4: LIBRARY_STRATEGY [-]
+ 4: LIBRARY_SOURCE [-]
+ 4: LIBRARY_SELECTION [-]
+ 4: LIBRARY_LAYOUT [-]
+ 5: PAIRED [-]
+ 4: LIBRARY_CONSTRUCTION_PROTOCOL [-]
+ 2: PLATFORM [-]
+ 3: ILLUMINA [-]
+ 4: INSTRUMENT_MODEL [-]
+ 2: PROCESSING [-]
+ 2: EXPERIMENT_LINKS [-]
+ 3: EXPERIMENT_LINK [-]
+ 4: XREF_LINK [-]
+ 5: DB [-]
+ 5: ID [-]
+ 2: EXPERIMENT_ATTRIBUTES [-]
+ 3: EXPERIMENT_ATTRIBUTE [-]
+ 4: TAG [-]
+ 4: VALUE [-]
+ '''
+
d = {}
root = xml.etree.ElementTree.parse(fname).getroot()