diff options
author | Lan Hui <lanhui@zjnu.edu.cn> | 2025-06-04 15:05:23 +0800 |
---|---|---|
committer | Lan Hui <lanhui@zjnu.edu.cn> | 2025-06-04 15:05:23 +0800 |
commit | 5104dd7d301adbb1bc49f3fd0f384b6cffc1e591 (patch) | |
tree | ad14b4c2d9aa137c2c1288ef7056a45c179b7527 | |
parent | def5d8276577805f5c37bb5e694572646c71a120 (diff) |
Make cleaner tissue names
-rw-r--r-- | Code/parse_ena_xml.py | 12 |
1 file changed, 7 insertions, 5 deletions
diff --git a/Code/parse_ena_xml.py b/Code/parse_ena_xml.py index cbabc3c..4d8dc1d 100644 --- a/Code/parse_ena_xml.py +++ b/Code/parse_ena_xml.py @@ -333,11 +333,11 @@ def clean_tissue_info(tissue_type): return 'seedlings' if 'rootstock' in tissue_type: return 'root' - return tissue_type.strip().lower().translate(str.maketrans('', '', string.punctuation)) + return replace_punctuation_with_space(tissue_type.strip().lower()) def get_singular_form(w): - d = {'seedlings':'seedling', 'roots':'root', 'leaves':'leaf', 'flowers':'flower', 'floral':'flower', 'shoots':'shoot', 'apices':'apex', 'stems':'stem', 'seeds':'seed', 'petals':'petals', 'sepals':'sepal', 'embryos':'embryo', 'embryonic':'embryo', 'cotyledons':'cotyledon', 'hairs':'hair', 'meristems':'meristem', 'epidermal':'epidermis', 'apical':'apex', 'buds':'bud', 'vacuoles':'vacuole', 'vacuolar':'vacuole', 'tips':'tip', 'pollens':'pollen', 'hypocotyls':'hypocotyl', 'tubes':'tube', 'stomatal':'stomata', 'ovule':'ovules', 'pistils':'pistil', 'anthers':'anther', 'carpels':'carpel', 'pedicle':'pedicel', 'vascular':'vasculum', 'cells':'cell', 'plants':'plant', 'siliques':'silique', 'organs':'organ', 'inflorescences':'inflorescence'} + d = {'seedlings':'seedling', 'roots':'root', 'leaves':'leaf', 'flowers':'flower', 'floral':'flower', 'shoots':'shoot', 'apices':'apex', 'stems':'stem', 'seeds':'seed', 'petals':'petals', 'sepals':'sepal', 'embryos':'embryo', 'embryonic':'embryo', 'cotyledons':'cotyledon', 'hairs':'hair', 'meristems':'meristem', 'epidermal':'epidermis', 'apical':'apex', 'buds':'bud', 'vacuoles':'vacuole', 'vacuolar':'vacuole', 'tips':'tip', 'pollens':'pollen', 'hypocotyls':'hypocotyl', 'tubes':'tube', 'stomatal':'stomata', 'ovule':'ovules', 'pistils':'pistil', 'anthers':'anther', 'carpels':'carpel', 'pedicle':'pedicel', 'vascular':'vasculum', 'cells':'cell', 'plants':'plant', 'siliques':'silique', 'organs':'organ', 'inflorescences':'inflorescence', 'rosettes':'rosette', 'protoplasts':'protoplast'} if w in d: 
return d[w] return w @@ -349,6 +349,10 @@ def get_singular_form_for_several_words(s): return ' '.join(result) +def replace_punctuation_with_space(s): + return s.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation))) + + def get_tissue(run_id, d_run, experiment_id, d_experiment, sample_id, d_sample, study_id, d_study): ''' Extract tissue name from s. s may contain several tissue names, return them ordered by frequency. ''' @@ -380,9 +384,7 @@ def get_tissue(run_id, d_run, experiment_id, d_experiment, sample_id, d_sample, # build a count dictionary, where key is a word d = {} s = s.lower() - s = s.replace('_', ' ') - s = s.replace('-', ' ') - s = s.translate(str.maketrans('', '', string.punctuation)) + s = replace_punctuation_with_space(s) wlst = s.split() for w in wlst: if w in lst: |