summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Lan Hui <lanhui@zjnu.edu.cn> 2025-06-03 16:50:29 +0800
committer Lan Hui <lanhui@zjnu.edu.cn> 2025-06-03 16:50:29 +0800
commit def5d8276577805f5c37bb5e694572646c71a120 (patch)
tree fa3e2d2eef8c060b02eeaf5b1d7e476c957dcef3
parent c1dcc3ecf65c081468794eeb134c5fdad6fb4081 (diff)
Get more complete tissue information from ena_sample.xml
-rw-r--r-- Code/parse_ena_xml.py 28
1 file changed, 24 insertions, 4 deletions
diff --git a/Code/parse_ena_xml.py b/Code/parse_ena_xml.py
index f0ddfb3..cbabc3c 100644
--- a/Code/parse_ena_xml.py
+++ b/Code/parse_ena_xml.py
@@ -193,6 +193,12 @@ def parse_sample(fname):
else:
d2['external_id'] = '.'
+ acc = c.find('./IDENTIFIERS/SECONDARY_ID')
+ if acc != None:
+ secondary_id = acc.text
+ else:
+ secondary_id = 'None'
+
desc = c.find('DESCRIPTION')
d2['description'] = 'None'
if desc != None and desc.text != None:
@@ -214,6 +220,7 @@ def parse_sample(fname):
d2['tissue'] = clean_tissue_info(tissue_type) # remove space, lower letters, and remove punctuations
d[primary_id] = d2
+ d[secondary_id] = d2
return d
@@ -286,7 +293,7 @@ def parse_experiment(fname):
if desc != None and desc.text != None:
d2['description'] = desc.text
- sample = c.find('./DESIGN/SAMPLE_DESCRIPTOR/IDENTIFIERS/EXTERNAL_ID')
+ sample = c.find('./DESIGN/SAMPLE_DESCRIPTOR/IDENTIFIERS/PRIMARY_ID')
d2['sample_id'] = 'None'
if sample != None and sample.text != None:
d2['sample_id'] = sample.text
@@ -407,6 +414,7 @@ if __name__ == '__main__':
d_study = parse_study(ENA_RECORDS_STUDY) # STUDY
print(f'%% {ENA_RECORDS_STUDY}: {len(d_study)} entries')
+
cmd = 'export PYTHONIOENCODING=UTF-8' # since xml files contains non-ascii characters, use this command to avoid encoding error during redirection
os.system(cmd)
@@ -450,20 +458,30 @@ if __name__ == '__main__':
json_dict = {}
count_transcriptomic = 0
count_tissue = 0
+ count_no_sample_id = 0
for k in sorted(d_run_keys):
+ # k - run id, k2 - experiment id, k3 = sample id, k4 - study id
d = {}
k2 = d_run[k]['experiment_id']
d['experiment_id'] = k2
d['tissue'] = ''
d['sample_id'] = d['study_id'] = d['library_strategy'] = d['library_source'] = d['detail'] = ''
+ k3 = k4 = 'None'
if k2 in d_experiment:
k3 = d_experiment[k2]['sample_id']
+ if k3 == 'None':
+ k3 = d_run[k]['sample_id']
+ count_no_sample_id += 1
k4 = d_experiment[k2]['study_id']
- d['sample_id'] = k3
- d['study_id'] = k4
- d['tissue'] = get_tissue(k, d_run, k2, d_experiment, k3, d_sample, k4, d_study)
d['library_strategy'] = d_experiment[k2]['library_strategy']
d['library_source'] = d_experiment[k2]['library_source']
+ else:
+ k3 = d_run[k]['sample_id']
+
+ d['sample_id'] = k3
+ d['study_id'] = k4
+ d['tissue'] = get_tissue(k, d_run, k2, d_experiment, k3, d_sample, k4, d_study)
+
json_dict[k] = d
if d['library_source'] == 'TRANSCRIPTOMIC':
count_transcriptomic += 1
@@ -473,6 +491,8 @@ if __name__ == '__main__':
percent = 100*count_tissue/count_transcriptomic
print(f'%% RNA-seq: {count_transcriptomic}, of which {count_tissue} having tissue info ({percent} percent)')
+ print(f'%% Sample id not in d_experiment count: {count_no_sample_id}')
+
temp_fname = RNA_SEQ_INFO_DATABASE_JSON + '.temp'
with open(temp_fname, 'w') as f:
json.dump(json_dict, f, indent=4)