diff options
| author | Lan Hui <lanhui@zjnu.edu.cn> | 2025-04-16 15:48:13 +0800 | 
|---|---|---|
| committer | Lan Hui <lanhui@zjnu.edu.cn> | 2025-04-16 15:48:13 +0800 | 
| commit | 0a09eb64a364a1bf67e06c77cce34672beecaec1 (patch) | |
| tree | 0d4b9f539042ed5d1dbc52199087b0755d82e34a /Code/parse_ena_xml.py | |
| parent | 52a1fc0d30cd48c822ca3b0bde5a4cd5f7ed2ac2 (diff) | |
Output some stats, beginning with %%
Diffstat (limited to 'Code/parse_ena_xml.py')
| -rw-r--r-- | Code/parse_ena_xml.py | 21 | 
1 files changed, 14 insertions, 7 deletions
```diff
diff --git a/Code/parse_ena_xml.py b/Code/parse_ena_xml.py
index c9faefe..8d45481 100644
--- a/Code/parse_ena_xml.py
+++ b/Code/parse_ena_xml.py
@@ -266,14 +266,13 @@ if __name__ == '__main__':
     # ENA xml meta files do not differentiate between different types of Seq, but are organised by RUN, STUDY, SAMPLE, EXPERIMENT.  So each
     # of the following function is call for each type of xml file.  The input files were downloaded from https://www.ebi.ac.uk/ena/browser/view/Taxon:3702
     d_run        = parse_run(ENA_RECORDS_READ_RUN)                   # RUN
-    print(f'{ENA_RECORDS_READ_RUN}:  {len(d_run)} entries')
+    print(f'%% {ENA_RECORDS_READ_RUN}:  {len(d_run)} entries')
     d_experiment = parse_experiment(ENA_RECORDS_READ_EXPERIMENT)     # EXPERIMENT, including library strategy (RNA-Seq, WSG, etc) and library source (TRANSCRIPTIOMIC, GENOMIC, etc)
-    print(f'{ENA_RECORDS_READ_EXPERIMENT}:  {len(d_experiment)} entries')
-    #print(d_experiment['ERX9699060'])
+    print(f'%% {ENA_RECORDS_READ_EXPERIMENT}:  {len(d_experiment)} entries')
     d_sample     = parse_sample(ENA_RECORDS_SAMPLE)                  # SAMPLE
-    print(f'{ENA_RECORDS_SAMPLE}:  {len(d_sample)} entries')
+    print(f'%% {ENA_RECORDS_SAMPLE}:  {len(d_sample)} entries')
     d_study      = parse_study(ENA_RECORDS_STUDY)                    # STUDY
-    print(f'{ENA_RECORDS_STUDY}:  {len(d_study)} entries')
+    print(f'%% {ENA_RECORDS_STUDY}:  {len(d_study)} entries')
 
     cmd = 'export PYTHONIOENCODING=UTF-8'  # since xml files contains non-ascii characters, use this command to avoid encoding error during redirection
     os.system(cmd)
@@ -311,12 +310,14 @@ if __name__ == '__main__':
 
     # Make a json file as well. this file is used to display rna-seq information in scatterplots.
     json_dict = {}
+    count_transcriptomic = 0
+    count_tissue = 0
     for k in sorted(d_run_keys):
         d = {}
         k2 = d_run[k]['experiment_id']
         k3 = d_experiment[k2]['sample_id'] if k2 in d_experiment else 'SAM_UNKNOWN'
         k4 = d_experiment[k2]['study_id'] if k2 in d_experiment else 'PRJ_UNKNOWN'
-        d['tissue'] = d['library_strategy'] = d['library_source'] = d['sample_id'] = '.'
+        d['tissue'] = d['library_strategy'] = d['library_source'] = d['sample_id'] = ''
         if k2 in d_experiment:
             d['sample_id'] = d_experiment[k2]['sample_id']
             d['tissue'] = get_tissue(k, d_run, k2, d_experiment, k3, d_sample, k4, d_study)
@@ -324,7 +325,13 @@ if __name__ == '__main__':
             d['library_source'] = d_experiment[k2]['library_source']
         d['detail'] = 'TBA'
         json_dict[k] = d
-        print(d)
+        if d['library_source'] == 'TRANSCRIPTOMIC':
+            count_transcriptomic += 1
+            if d['tissue']:
+                count_tissue += 1
+
+    percent = 100*count_tissue/count_transcriptomic
+    print(f'%% RNA-seq: {count_transcriptomic}, of which {count_tissue} having tissue info ({percent} percent)')
 
     fname = '../Data/information/rnaseq_info_database.json.temp'
     with open(fname, 'w') as f:
```
