diff options
author | Lan Hui <lanhui@zjnu.edu.cn> | 2025-04-16 15:48:13 +0800 |
---|---|---|
committer | Lan Hui <lanhui@zjnu.edu.cn> | 2025-04-16 15:48:13 +0800 |
commit | 0a09eb64a364a1bf67e06c77cce34672beecaec1 (patch) | |
tree | 0d4b9f539042ed5d1dbc52199087b0755d82e34a /Code/parse_ena_xml.py | |
parent | 52a1fc0d30cd48c822ca3b0bde5a4cd5f7ed2ac2 (diff) |
Output some stats, beginning with %%
Diffstat (limited to 'Code/parse_ena_xml.py')
-rw-r--r-- | Code/parse_ena_xml.py | 21 |
1 file changed, 14 insertions, 7 deletions
diff --git a/Code/parse_ena_xml.py b/Code/parse_ena_xml.py index c9faefe..8d45481 100644 --- a/Code/parse_ena_xml.py +++ b/Code/parse_ena_xml.py @@ -266,14 +266,13 @@ if __name__ == '__main__': # ENA xml meta files do not differentiate between different types of Seq, but are organised by RUN, STUDY, SAMPLE, EXPERIMENT. So each # of the following function is call for each type of xml file. The input files were downloaded from https://www.ebi.ac.uk/ena/browser/view/Taxon:3702 d_run = parse_run(ENA_RECORDS_READ_RUN) # RUN - print(f'{ENA_RECORDS_READ_RUN}: {len(d_run)} entries') + print(f'%% {ENA_RECORDS_READ_RUN}: {len(d_run)} entries') d_experiment = parse_experiment(ENA_RECORDS_READ_EXPERIMENT) # EXPERIMENT, including library strategy (RNA-Seq, WSG, etc) and library source (TRANSCRIPTIOMIC, GENOMIC, etc) - print(f'{ENA_RECORDS_READ_EXPERIMENT}: {len(d_experiment)} entries') - #print(d_experiment['ERX9699060']) + print(f'%% {ENA_RECORDS_READ_EXPERIMENT}: {len(d_experiment)} entries') d_sample = parse_sample(ENA_RECORDS_SAMPLE) # SAMPLE - print(f'{ENA_RECORDS_SAMPLE}: {len(d_sample)} entries') + print(f'%% {ENA_RECORDS_SAMPLE}: {len(d_sample)} entries') d_study = parse_study(ENA_RECORDS_STUDY) # STUDY - print(f'{ENA_RECORDS_STUDY}: {len(d_study)} entries') + print(f'%% {ENA_RECORDS_STUDY}: {len(d_study)} entries') cmd = 'export PYTHONIOENCODING=UTF-8' # since xml files contains non-ascii characters, use this command to avoid encoding error during redirection os.system(cmd) @@ -311,12 +310,14 @@ if __name__ == '__main__': # Make a json file as well. this file is used to display rna-seq information in scatterplots. json_dict = {} + count_transcriptomic = 0 + count_tissue = 0 for k in sorted(d_run_keys): d = {} k2 = d_run[k]['experiment_id'] k3 = d_experiment[k2]['sample_id'] if k2 in d_experiment else 'SAM_UNKNOWN' k4 = d_experiment[k2]['study_id'] if k2 in d_experiment else 'PRJ_UNKNOWN' - d['tissue'] = d['library_strategy'] = d['library_source'] = d['sample_id'] = '.' 
+ d['tissue'] = d['library_strategy'] = d['library_source'] = d['sample_id'] = '' if k2 in d_experiment: d['sample_id'] = d_experiment[k2]['sample_id'] d['tissue'] = get_tissue(k, d_run, k2, d_experiment, k3, d_sample, k4, d_study) @@ -324,7 +325,13 @@ if __name__ == '__main__': d['library_source'] = d_experiment[k2]['library_source'] d['detail'] = 'TBA' json_dict[k] = d - print(d) + if d['library_source'] == 'TRANSCRIPTOMIC': + count_transcriptomic += 1 + if d['tissue']: + count_tissue += 1 + + percent = 100*count_tissue/count_transcriptomic + print(f'%% RNA-seq: {count_transcriptomic}, of which {count_tissue} having tissue info ({percent} percent)') fname = '../Data/information/rnaseq_info_database.json.temp' with open(fname, 'w') as f: |