diff options
author | Hui Lan <lanhui@zjnu.edu.cn> | 2025-04-14 18:51:17 +0800 |
---|---|---|
committer | Hui Lan <lanhui@zjnu.edu.cn> | 2025-04-14 18:51:17 +0800 |
commit | b0a477d6f8a888832ae339593cf4bd0ef05a23df (patch) | |
tree | 2b0c5954dfa00c4372b761435e5821b3df290d47 /Code | |
parent | 7d161d428463ac865459c251a820d85085a2c5fb (diff) |
Further improve XML processing logic
Diffstat (limited to 'Code')
-rw-r--r-- | Code/parse_ena_xml.py | 84 |
1 files changed, 58 insertions, 26 deletions
diff --git a/Code/parse_ena_xml.py b/Code/parse_ena_xml.py index 1614d7d..1bc2862 100644 --- a/Code/parse_ena_xml.py +++ b/Code/parse_ena_xml.py @@ -150,6 +150,11 @@ def parse_experiment(fname): if desc != None and desc.text != None: d2['description'] = desc.text + sample = c.find('./DESIGN/SAMPLE_DESCRIPTOR/IDENTIFIERS/EXTERNAL_ID/') + d2['sample_id'] = 'None' + if sample != None and sample.text != None: + d2['sample_id'] = sample.text + strategy = c.find('./DESIGN/LIBRARY_DESCRIPTOR/LIBRARY_STRATEGY') d2['library_strategy'] = 'None' # we look for RNA-Seq if strategy != None and strategy.text != None: @@ -198,6 +203,13 @@ def get_tissue(s): return result.rstrip(';') +def get_tissue2(sample_id, d): + tissue = '.' + if sample_id in d: + tissue = d[sample_id]['tissue'] + return tissue + + ## main if __name__ == '__main__': @@ -221,6 +233,52 @@ if __name__ == '__main__': d_run_keys = d_run.keys() d_run_keys = list(set(d_run_keys)) + for k in sorted(d_run_keys): + lst = [k] + sample_id = d_run[k]['sample_id'] + experiment_id = d_run[k]['experiment_id'] + study_id = d_run[k]['study_id'] + study_id_PRJ = '.' + title = d_run[k]['title'] + alias = d_run[k]['alias'] + description = '.' + library_strategy = '.' + library_source = '.' + if experiment_id in d_experiment: + description = d_experiment[experiment_id]['description'] + library_strategy = d_experiment[experiment_id]['library_strategy'] + library_source = d_experiment[experiment_id]['library_source'] + lst.append(sample_id) + lst.append(experiment_id) + lst.append(study_id) + lst.append(study_id_PRJ) + lst.append(title) + lst.append(alias) + lst.append(description) + lst.append(library_strategy) + lst.append(library_source) + print('%s' % ('\t'.join(lst))) + + # Make a json file as well. this file is used to display rna-seq information in scatterplots. + json_dict = {} + for k in sorted(d_run_keys): + d = {} + k2 = d_run[k]['experiment_id'] + d['tissue'] = d['library_strategy'] = d['library_source'] = '.' 
+ if k2 in d_experiment: + d['tissue'] = get_tissue2(d_experiment[k2]['sample_id'], d_sample) + d['library_strategy'] = d_experiment[k2]['library_strategy'] + d['library_source'] = d_experiment[k2]['library_source'] + d['detail'] = 'TBA' + json_dict[k] = d + print(d) + + fname = '../Data/information/rnaseq_info_database.json.temp' + with open(fname, 'w') as f: + json.dump(json_dict, f, indent=4) + + + sys.exit() # Collect information for each run ID for k in sorted(d_run_keys): lst = [] @@ -265,29 +323,3 @@ if __name__ == '__main__': lst.append(d_experiment[k]['library_source']) print('%s' % ('\t'.join(lst))) - - - # Make a json file as well. this file is used to display rna-seq information in scatterplots. - json_dict = {} - for k in sorted(d_run_keys): - if k in d_run: - s = 'Title: ' + d_run[k]['title'] + '. Alias: ' + d_run[k]['alias'] + '. More info:' - if k in d_study: - s += ' ' + d_study[k]['title'] + ' ' + d_study[k]['description'] - if k in d_sample: - s += ' ' + d_sample[k]['title'] + ' ' + d_sample[k]['description'] - if k in d_experiment: - s += ' ' + d_experiment[k]['title'] + ' ' + d_experiment[k]['description'] - - s = s.strip() - d = {} - d['tissue'] = get_tissue(s) - d['library_strategy'] = d_experiment[k]['library_strategy'] - d['library_source'] = d_experiment[k]['library_source'] - d['detail'] = s[0:min(MAX_DESCRIPTION_LENGTH, len(s))] + ' ...' - - json_dict[k] = d - - fname = '../Data/information/rnaseq_info_database.json.temp' - with open(fname, 'w') as f: - json.dump(json_dict, f, indent=4) |