diff options
author | Lan Hui <lanhui@zjnu.edu.cn> | 2025-06-03 16:50:29 +0800 |
---|---|---|
committer | Lan Hui <lanhui@zjnu.edu.cn> | 2025-06-03 16:50:29 +0800 |
commit | def5d8276577805f5c37bb5e694572646c71a120 (patch) | |
tree | fa3e2d2eef8c060b02eeaf5b1d7e476c957dcef3 | |
parent | c1dcc3ecf65c081468794eeb134c5fdad6fb4081 (diff) |
Get more complete tissue information from ena_sample.xml
-rw-r--r-- | Code/parse_ena_xml.py | 28 |
1 file changed, 24 insertions, 4 deletions
```diff
diff --git a/Code/parse_ena_xml.py b/Code/parse_ena_xml.py
index f0ddfb3..cbabc3c 100644
--- a/Code/parse_ena_xml.py
+++ b/Code/parse_ena_xml.py
@@ -193,6 +193,12 @@ def parse_sample(fname):
         else:
             d2['external_id'] = '.'
 
+        acc = c.find('./IDENTIFIERS/SECONDARY_ID')
+        if acc != None:
+            secondary_id = acc.text
+        else:
+            secondary_id = 'None'
+
         desc = c.find('DESCRIPTION')
         d2['description'] = 'None'
         if desc != None and desc.text != None:
@@ -214,6 +220,7 @@ def parse_sample(fname):
         d2['tissue'] = clean_tissue_info(tissue_type) # remove space, lower letters, and remove punctuations
 
         d[primary_id] = d2
+        d[secondary_id] = d2
 
     return d
 
@@ -286,7 +293,7 @@ def parse_experiment(fname):
         if desc != None and desc.text != None:
             d2['description'] = desc.text
 
-        sample = c.find('./DESIGN/SAMPLE_DESCRIPTOR/IDENTIFIERS/EXTERNAL_ID')
+        sample = c.find('./DESIGN/SAMPLE_DESCRIPTOR/IDENTIFIERS/PRIMARY_ID')
         d2['sample_id'] = 'None'
         if sample != None and sample.text != None:
             d2['sample_id'] = sample.text
@@ -407,6 +414,7 @@ if __name__ == '__main__':
 
     d_study = parse_study(ENA_RECORDS_STUDY) # STUDY
     print(f'%% {ENA_RECORDS_STUDY}: {len(d_study)} entries')
+
     cmd = 'export PYTHONIOENCODING=UTF-8' # since xml files contains non-ascii characters, use this command to avoid encoding error during redirection
     os.system(cmd)
 
@@ -450,20 +458,30 @@ if __name__ == '__main__':
     json_dict = {}
     count_transcriptomic = 0
     count_tissue = 0
+    count_no_sample_id = 0
     for k in sorted(d_run_keys):
+        # k - run id, k2 - experiment id, k3 = sample id, k4 - study id
         d = {}
         k2 = d_run[k]['experiment_id']
         d['experiment_id'] = k2
         d['tissue'] = ''
         d['sample_id'] = d['study_id'] = d['library_strategy'] = d['library_source'] = d['detail'] = ''
+        k3 = k4 = 'None'
         if k2 in d_experiment:
             k3 = d_experiment[k2]['sample_id']
+            if k3 == 'None':
+                k3 = d_run[k]['sample_id']
+                count_no_sample_id += 1
             k4 = d_experiment[k2]['study_id']
-            d['sample_id'] = k3
-            d['study_id'] = k4
-            d['tissue'] = get_tissue(k, d_run, k2, d_experiment, k3, d_sample, k4, d_study)
             d['library_strategy'] = d_experiment[k2]['library_strategy']
             d['library_source'] = d_experiment[k2]['library_source']
+        else:
+            k3 = d_run[k]['sample_id']
+
+        d['sample_id'] = k3
+        d['study_id'] = k4
+        d['tissue'] = get_tissue(k, d_run, k2, d_experiment, k3, d_sample, k4, d_study)
+
         json_dict[k] = d
         if d['library_source'] == 'TRANSCRIPTOMIC':
             count_transcriptomic += 1
@@ -473,6 +491,8 @@ if __name__ == '__main__':
     percent = 100*count_tissue/count_transcriptomic
     print(f'%% RNA-seq: {count_transcriptomic}, of which {count_tissue} having tissue info ({percent} percent)')
 
+    print(f'%% Sample id not in d_experiment count: {count_no_sample_id}')
+
     temp_fname = RNA_SEQ_INFO_DATABASE_JSON + '.temp'
     with open(temp_fname, 'w') as f:
         json.dump(json_dict, f, indent=4)
```