Make cleaner tissue names

author: Lan Hui <lanhui@zjnu.edu.cn> 2025-06-04 15:05:23 +0800
committer: Lan Hui <lanhui@zjnu.edu.cn> 2025-06-04 15:05:23 +0800
commit: 5104dd7d301adbb1bc49f3fd0f384b6cffc1e591 (patch)
tree: ad14b4c2d9aa137c2c1288ef7056a45c179b7527
parent: def5d8276577805f5c37bb5e694572646c71a120 (diff)
1 file changed, 7 insertions, 5 deletions
diff --git a/Code/parse_ena_xml.py b/Code/parse_ena_xml.py
index cbabc3c..4d8dc1d 100644
--- a/Code/parse_ena_xml.py
+++ b/Code/parse_ena_xml.py
@@ -333,11 +333,11 @@ def clean_tissue_info(tissue_type):
         return 'seedlings'
     if 'rootstock' in tissue_type:
         return 'root'
-    return tissue_type.strip().lower().translate(str.maketrans('', '', string.punctuation))
+    return replace_punctuation_with_space(tissue_type.strip().lower())
 
 
 def get_singular_form(w):
-    d = {'seedlings':'seedling', 'roots':'root', 'leaves':'leaf', 'flowers':'flower', 'floral':'flower', 'shoots':'shoot', 'apices':'apex', 'stems':'stem', 'seeds':'seed', 'petals':'petals', 'sepals':'sepal', 'embryos':'embryo', 'embryonic':'embryo', 'cotyledons':'cotyledon', 'hairs':'hair', 'meristems':'meristem', 'epidermal':'epidermis', 'apical':'apex', 'buds':'bud', 'vacuoles':'vacuole', 'vacuolar':'vacuole', 'tips':'tip', 'pollens':'pollen', 'hypocotyls':'hypocotyl', 'tubes':'tube', 'stomatal':'stomata', 'ovule':'ovules', 'pistils':'pistil', 'anthers':'anther', 'carpels':'carpel', 'pedicle':'pedicel', 'vascular':'vasculum', 'cells':'cell', 'plants':'plant', 'siliques':'silique', 'organs':'organ', 'inflorescences':'inflorescence'}
+    d = {'seedlings':'seedling', 'roots':'root', 'leaves':'leaf', 'flowers':'flower', 'floral':'flower', 'shoots':'shoot', 'apices':'apex', 'stems':'stem', 'seeds':'seed', 'petals':'petals', 'sepals':'sepal', 'embryos':'embryo', 'embryonic':'embryo', 'cotyledons':'cotyledon', 'hairs':'hair', 'meristems':'meristem', 'epidermal':'epidermis', 'apical':'apex', 'buds':'bud', 'vacuoles':'vacuole', 'vacuolar':'vacuole', 'tips':'tip', 'pollens':'pollen', 'hypocotyls':'hypocotyl', 'tubes':'tube', 'stomatal':'stomata', 'ovule':'ovules', 'pistils':'pistil', 'anthers':'anther', 'carpels':'carpel', 'pedicle':'pedicel', 'vascular':'vasculum', 'cells':'cell', 'plants':'plant', 'siliques':'silique', 'organs':'organ', 'inflorescences':'inflorescence', 'rosettes':'rosette', 'protoplasts':'protoplast'}
     if w in d:
         return d[w]
     return w
@@ -349,6 +349,10 @@ def get_singular_form_for_several_words(s):
     return ' '.join(result)
 
 
+def replace_punctuation_with_space(s):
+    return s.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))
+
+
 def get_tissue(run_id, d_run, experiment_id, d_experiment, sample_id, d_sample, study_id, d_study):
     ''' Extract tissue name from s.  s may contain several tissue names, return them ordered by frequency.  '''
 
@@ -380,9 +384,7 @@ def get_tissue(run_id, d_run, experiment_id, d_experiment, sample_id, d_sample,
     # build a count dictionary, where key is a word
     d = {}
     s = s.lower()
-    s = s.replace('_', ' ')
-    s = s.replace('-', ' ')
-    s = s.translate(str.maketrans('', '', string.punctuation))
+    s = replace_punctuation_with_space(s)
     wlst = s.split()
     for w in wlst:
         if w in lst:
author	Lan Hui <lanhui@zjnu.edu.cn>	2025-06-04 15:05:23 +0800
committer	Lan Hui <lanhui@zjnu.edu.cn>	2025-06-04 15:05:23 +0800
commit	5104dd7d301adbb1bc49f3fd0f384b6cffc1e591 (patch)
tree	ad14b4c2d9aa137c2c1288ef7056a45c179b7527
parent	def5d8276577805f5c37bb5e694572646c71a120 (diff)