From 0a09eb64a364a1bf67e06c77cce34672beecaec1 Mon Sep 17 00:00:00 2001 From: Lan Hui Date: Wed, 16 Apr 2025 15:48:13 +0800 Subject: Output some stats, beginning with %% --- Code/parse_ena_xml.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'Code') diff --git a/Code/parse_ena_xml.py b/Code/parse_ena_xml.py index c9faefe..8d45481 100644 --- a/Code/parse_ena_xml.py +++ b/Code/parse_ena_xml.py @@ -266,14 +266,13 @@ if __name__ == '__main__': # ENA xml meta files do not differentiate between different types of Seq, but are organised by RUN, STUDY, SAMPLE, EXPERIMENT. So each # of the following function is call for each type of xml file. The input files were downloaded from https://www.ebi.ac.uk/ena/browser/view/Taxon:3702 d_run = parse_run(ENA_RECORDS_READ_RUN) # RUN - print(f'{ENA_RECORDS_READ_RUN}: {len(d_run)} entries') + print(f'%% {ENA_RECORDS_READ_RUN}: {len(d_run)} entries') d_experiment = parse_experiment(ENA_RECORDS_READ_EXPERIMENT) # EXPERIMENT, including library strategy (RNA-Seq, WSG, etc) and library source (TRANSCRIPTIOMIC, GENOMIC, etc) - print(f'{ENA_RECORDS_READ_EXPERIMENT}: {len(d_experiment)} entries') - #print(d_experiment['ERX9699060']) + print(f'%% {ENA_RECORDS_READ_EXPERIMENT}: {len(d_experiment)} entries') d_sample = parse_sample(ENA_RECORDS_SAMPLE) # SAMPLE - print(f'{ENA_RECORDS_SAMPLE}: {len(d_sample)} entries') + print(f'%% {ENA_RECORDS_SAMPLE}: {len(d_sample)} entries') d_study = parse_study(ENA_RECORDS_STUDY) # STUDY - print(f'{ENA_RECORDS_STUDY}: {len(d_study)} entries') + print(f'%% {ENA_RECORDS_STUDY}: {len(d_study)} entries') cmd = 'export PYTHONIOENCODING=UTF-8' # since xml files contains non-ascii characters, use this command to avoid encoding error during redirection os.system(cmd) @@ -311,12 +310,14 @@ if __name__ == '__main__': # Make a json file as well. this file is used to display rna-seq information in scatterplots. json_dict = {} + count_transcriptomic = 0 + count_tissue = 0 for k in sorted(d_run_keys): d = {} k2 = d_run[k]['experiment_id'] k3 = d_experiment[k2]['sample_id'] if k2 in d_experiment else 'SAM_UNKNOWN' k4 = d_experiment[k2]['study_id'] if k2 in d_experiment else 'PRJ_UNKNOWN' - d['tissue'] = d['library_strategy'] = d['library_source'] = d['sample_id'] = '.' + d['tissue'] = d['library_strategy'] = d['library_source'] = d['sample_id'] = '' if k2 in d_experiment: d['sample_id'] = d_experiment[k2]['sample_id'] d['tissue'] = get_tissue(k, d_run, k2, d_experiment, k3, d_sample, k4, d_study) @@ -324,7 +325,13 @@ if __name__ == '__main__': d['library_source'] = d_experiment[k2]['library_source'] d['detail'] = 'TBA' json_dict[k] = d - print(d) + if d['library_source'] == 'TRANSCRIPTOMIC': + count_transcriptomic += 1 + if d['tissue']: + count_tissue += 1 + + percent = 100*count_tissue/count_transcriptomic + print(f'%% RNA-seq: {count_transcriptomic}, of which {count_tissue} having tissue info ({percent} percent)') fname = '../Data/information/rnaseq_info_database.json.temp' with open(fname, 'w') as f: -- cgit v1.2.1