From 367285b4cf7edc639530e3fe17bbb390592b90a1 Mon Sep 17 00:00:00 2001 From: Lan Hui Date: Tue, 20 May 2025 15:38:13 +0800 Subject: Derive better tissue names --- Code/parse_ena_xml.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'Code') diff --git a/Code/parse_ena_xml.py b/Code/parse_ena_xml.py index eb1f610..6b78371 100644 --- a/Code/parse_ena_xml.py +++ b/Code/parse_ena_xml.py @@ -210,8 +210,8 @@ def parse_sample(fname): value = i.find('./VALUE') if 'tissue' in tag.text or 'organism part' in tag.text: #print(value.text) - tissue_type = value.text + ';' - d2['tissue'] = tissue_type.rstrip(';') + tissue_type += value.text + ' ' + d2['tissue'] = tissue_type.strip().lower().translate(str.maketrans('', '', string.punctuation)) # remove space, lower letters, and remove punctuations d[primary_id] = d2 @@ -319,18 +319,24 @@ def parse_experiment(fname): def get_singular_form(w): - d = {'seedlings':'seedling', 'roots':'root', 'leaves':'leaf', 'flowers':'flower', 'floral':'flower', 'shoots':'shoot', 'apices':'apex', 'stems':'stem', 'seeds':'seed', 'petals':'petals', 'sepals':'sepal', 'embryos':'embryo', 'embryonic':'embryo', 'cotyledons':'cotyledon', 'hairs':'hair', 'meristems':'meristem', 'epidermal':'epidermis', 'apical':'apex', 'buds':'bud', 'vacuoles':'vacuole', 'vacuolar':'vacuole', 'tips':'tip', 'pollens':'pollen', 'hypocotyls':'hypocotyl', 'tubes':'tube', 'stomatal':'stomata', 'ovule':'ovules', 'pistils':'pistil', 'anthers':'anther', 'carpels':'carpel', 'pedicle':'pedicel', 'vascular':'vasculum'} + d = {'seedlings':'seedling', 'roots':'root', 'leaves':'leaf', 'flowers':'flower', 'floral':'flower', 'shoots':'shoot', 'apices':'apex', 'stems':'stem', 'seeds':'seed', 'petals':'petals', 'sepals':'sepal', 'embryos':'embryo', 'embryonic':'embryo', 'cotyledons':'cotyledon', 'hairs':'hair', 'meristems':'meristem', 'epidermal':'epidermis', 'apical':'apex', 'buds':'bud', 'vacuoles':'vacuole', 'vacuolar':'vacuole', 'tips':'tip', 'pollens':'pollen', 'hypocotyls':'hypocotyl', 'tubes':'tube', 'stomatal':'stomata', 'ovule':'ovules', 'pistils':'pistil', 'anthers':'anther', 'carpels':'carpel', 'pedicle':'pedicel', 'vascular':'vasculum', 'cells':'cell', 'plants':'plant', 'siliques':'silique', 'organs':'organ', 'inflorescences':'inflorescence'} if w in d: return d[w] return w +def get_singular_form_for_several_words(s): + lst = s.split() + result = [get_singular_form(w) for w in lst] + return ' '.join(result) + + def get_tissue(run_id, d_run, experiment_id, d_experiment, sample_id, d_sample, study_id, d_study): ''' Extract tissue name from s. s may contain several tissue names, return them ordered by frequency. ''' tissue = '' if sample_id in d_sample: - tissue = d_sample[sample_id]['tissue'] + tissue = get_singular_form_for_several_words(d_sample[sample_id]['tissue']) if tissue: return tissue @@ -351,7 +357,7 @@ def get_tissue(run_id, d_run, experiment_id, d_experiment, sample_id, d_sample, s += ' ' + d_study[study_id]['title'] s += ' ' + d_study[study_id]['description'] - lst = ['seedling', 'seedlings', 'root', 'roots', 'leaves', 'leaf', 'flower', 'flowers', 'floral', 'shoot', 'shoots', 'apex', 'apices', 'stamen', 'stem', 'stems', 'seed', 'seeds', 'petal', 'petals', 'sepal', 'sepals', 'embryo', 'embryos', 'embryonic', 'cotyledon', 'cotyledons', 'xylem', 'hair', 'hairs', 'phloem', 'pericycle', 'primordia', 'columella', 'cortex', 'meristem', 'meristems', 'cambium', 'epidermis', 'epidermal', 'phloem', 'mesophyll', 'apical', 'lateral', 'intercalary', 'parenchyma', 'collenchyma', 'sclerenchyma', 'bud', 'buds', 'endosperm', 'colletotrichum', 'stele', 'vacuoles', 'vacuole', 'vacuolar', 'tip', 'tips', 'pollen', 'hypocotyl', 'hypocotyls', 'tube', 'tubes', 'basal', 'stomatal', 'stomata', 'surface', 'progeny', 'ovules', 'carpel', 'carpels', 'gynoecium', 'pistil', 'pistils', 'anthers', 'anther', 'endodermis', 'dicotyledonous', 'hyphae', 'adabaxial', 'axial', 'cauline', 'rosette', 'pedicle', 'pedicel', 'inflorescence', 'petiole', 'lamina', 'vascular', 'bundle', 'sheath', 'microspore'] # possible tissue names, lower case. refer to /home/hui/network/test/rnaseq.word.count.txt for distinct words in rna seq. rnaseq.word.count.txt is generated by /home/hui/network/test/count_word.py + lst = ['seedling', 'seedlings', 'root', 'roots', 'leaves', 'leaf', 'flower', 'flowers', 'floral', 'shoot', 'shoots', 'apex', 'apices', 'stamen', 'stem', 'stems', 'seed', 'seeds', 'petal', 'petals', 'sepal', 'sepals', 'embryo', 'embryos', 'embryonic', 'cotyledon', 'cotyledons', 'xylem', 'hair', 'hairs', 'phloem', 'pericycle', 'primordia', 'columella', 'cortex', 'meristem', 'meristems', 'cambium', 'epidermis', 'epidermal', 'phloem', 'mesophyll', 'apical', 'lateral', 'intercalary', 'parenchyma', 'collenchyma', 'sclerenchyma', 'bud', 'buds', 'endosperm', 'colletotrichum', 'stele', 'vacuoles', 'vacuole', 'vacuolar', 'tip', 'tips', 'pollen', 'hypocotyl', 'hypocotyls', 'tube', 'tubes', 'basal', 'stomatal', 'stomata', 'surface', 'progeny', 'ovules', 'carpel', 'carpels', 'gynoecium', 'pistil', 'pistils', 'anthers', 'anther', 'endodermis', 'dicotyledonous', 'hyphae', 'adabaxial', 'axial', 'cauline', 'rosette', 'pedicle', 'pedicel', 'inflorescence', 'inflorescences', 'petiole', 'lamina', 'vascular', 'bundle', 'sheath', 'microspore', 'siliques', 'silique'] # possible tissue names, lower case. refer to /home/hui/network/test/rnaseq.word.count.txt for distinct words in rna seq. rnaseq.word.count.txt is generated by /home/hui/network/test/count_word.py # build a count dictionary, where key is a word d = {} @@ -464,7 +470,8 @@ if __name__ == '__main__': # But make a backup for rnaseq_info_database.json first try: bak_fname = backup_file(RNA_SEQ_INFO_DATABASE_JSON) - shutil.move(temp_fname, RNA_SEQ_INFO_DATABASE_JSON) print(f'Made {bak_fname}') except Exception as e: - print(f'Backup {RNA_SEQ_INFO_DATABASE_JSON} encountered problem') + print(f'Backup {RNA_SEQ_INFO_DATABASE_JSON} encountered problem: {e}') + finally: + shutil.move(temp_fname, RNA_SEQ_INFO_DATABASE_JSON) -- cgit v1.2.1