summaryrefslogtreecommitdiff
path: root/Code/parse_ena_xml.py
diff options
context:
space:
mode:
authorLan Hui <lanhui@zjnu.edu.cn>2025-04-16 15:48:13 +0800
committerLan Hui <lanhui@zjnu.edu.cn>2025-04-16 15:48:13 +0800
commit0a09eb64a364a1bf67e06c77cce34672beecaec1 (patch)
tree0d4b9f539042ed5d1dbc52199087b0755d82e34a /Code/parse_ena_xml.py
parent52a1fc0d30cd48c822ca3b0bde5a4cd5f7ed2ac2 (diff)
Output some stats, beginning with %%
Diffstat (limited to 'Code/parse_ena_xml.py')
-rw-r--r--Code/parse_ena_xml.py21
1 files changed, 14 insertions, 7 deletions
diff --git a/Code/parse_ena_xml.py b/Code/parse_ena_xml.py
index c9faefe..8d45481 100644
--- a/Code/parse_ena_xml.py
+++ b/Code/parse_ena_xml.py
@@ -266,14 +266,13 @@ if __name__ == '__main__':
# ENA xml meta files do not differentiate between different types of Seq, but are organised by RUN, STUDY, SAMPLE, EXPERIMENT. So each
# of the following functions is called once for each type of xml file. The input files were downloaded from https://www.ebi.ac.uk/ena/browser/view/Taxon:3702
d_run = parse_run(ENA_RECORDS_READ_RUN) # RUN
- print(f'{ENA_RECORDS_READ_RUN}: {len(d_run)} entries')
+ print(f'%% {ENA_RECORDS_READ_RUN}: {len(d_run)} entries')
d_experiment = parse_experiment(ENA_RECORDS_READ_EXPERIMENT) # EXPERIMENT, including library strategy (RNA-Seq, WGS, etc) and library source (TRANSCRIPTOMIC, GENOMIC, etc)
- print(f'{ENA_RECORDS_READ_EXPERIMENT}: {len(d_experiment)} entries')
- #print(d_experiment['ERX9699060'])
+ print(f'%% {ENA_RECORDS_READ_EXPERIMENT}: {len(d_experiment)} entries')
d_sample = parse_sample(ENA_RECORDS_SAMPLE) # SAMPLE
- print(f'{ENA_RECORDS_SAMPLE}: {len(d_sample)} entries')
+ print(f'%% {ENA_RECORDS_SAMPLE}: {len(d_sample)} entries')
d_study = parse_study(ENA_RECORDS_STUDY) # STUDY
- print(f'{ENA_RECORDS_STUDY}: {len(d_study)} entries')
+ print(f'%% {ENA_RECORDS_STUDY}: {len(d_study)} entries')
cmd = 'export PYTHONIOENCODING=UTF-8' # since xml files contain non-ascii characters, use this command to avoid encoding errors during redirection
os.system(cmd)
@@ -311,12 +310,14 @@ if __name__ == '__main__':
# Make a JSON file as well. This file is used to display RNA-seq information in scatterplots.
json_dict = {}
+ count_transcriptomic = 0
+ count_tissue = 0
for k in sorted(d_run_keys):
d = {}
k2 = d_run[k]['experiment_id']
k3 = d_experiment[k2]['sample_id'] if k2 in d_experiment else 'SAM_UNKNOWN'
k4 = d_experiment[k2]['study_id'] if k2 in d_experiment else 'PRJ_UNKNOWN'
- d['tissue'] = d['library_strategy'] = d['library_source'] = d['sample_id'] = '.'
+ d['tissue'] = d['library_strategy'] = d['library_source'] = d['sample_id'] = ''
if k2 in d_experiment:
d['sample_id'] = d_experiment[k2]['sample_id']
d['tissue'] = get_tissue(k, d_run, k2, d_experiment, k3, d_sample, k4, d_study)
@@ -324,7 +325,13 @@ if __name__ == '__main__':
d['library_source'] = d_experiment[k2]['library_source']
d['detail'] = 'TBA'
json_dict[k] = d
- print(d)
+ if d['library_source'] == 'TRANSCRIPTOMIC':
+ count_transcriptomic += 1
+ if d['tissue']:
+ count_tissue += 1
+
+ percent = 100*count_tissue/count_transcriptomic
+ print(f'%% RNA-seq: {count_transcriptomic}, of which {count_tissue} having tissue info ({percent} percent)')
fname = '../Data/information/rnaseq_info_database.json.temp'
with open(fname, 'w') as f: