summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Lan Hui <lanhui@zjnu.edu.cn> 2025-06-04 15:05:23 +0800
committer Lan Hui <lanhui@zjnu.edu.cn> 2025-06-04 15:05:23 +0800
commit 5104dd7d301adbb1bc49f3fd0f384b6cffc1e591 (patch)
tree ad14b4c2d9aa137c2c1288ef7056a45c179b7527
parent def5d8276577805f5c37bb5e694572646c71a120 (diff)
Make cleaner tissue names
-rw-r--r-- Code/parse_ena_xml.py 12
1 files changed, 7 insertions, 5 deletions
diff --git a/Code/parse_ena_xml.py b/Code/parse_ena_xml.py
index cbabc3c..4d8dc1d 100644
--- a/Code/parse_ena_xml.py
+++ b/Code/parse_ena_xml.py
@@ -333,11 +333,11 @@ def clean_tissue_info(tissue_type):
return 'seedlings'
if 'rootstock' in tissue_type:
return 'root'
- return tissue_type.strip().lower().translate(str.maketrans('', '', string.punctuation))
+ return replace_punctuation_with_space(tissue_type.strip().lower())
# Lookup table mapping plural nouns and adjectival forms of tissue words to a
# single canonical singular noun.  Built once at import time instead of on
# every call.
_SINGULAR_FORMS = {
    'seedlings': 'seedling',
    'roots': 'root',
    'leaves': 'leaf',
    'flowers': 'flower',
    'floral': 'flower',
    'shoots': 'shoot',
    'apices': 'apex',
    'stems': 'stem',
    'seeds': 'seed',
    'petals': 'petal',  # was 'petals' -> 'petals', a no-op mapping
    'sepals': 'sepal',
    'embryos': 'embryo',
    'embryonic': 'embryo',
    'cotyledons': 'cotyledon',
    'hairs': 'hair',
    'meristems': 'meristem',
    'epidermal': 'epidermis',
    'apical': 'apex',
    'buds': 'bud',
    'vacuoles': 'vacuole',
    'vacuolar': 'vacuole',
    'tips': 'tip',
    'pollens': 'pollen',
    'hypocotyls': 'hypocotyl',
    'tubes': 'tube',
    'stomatal': 'stomata',
    'ovules': 'ovule',  # was inverted ('ovule' -> 'ovules')
    'pistils': 'pistil',
    'anthers': 'anther',
    'carpels': 'carpel',
    'pedicle': 'pedicel',  # spelling normalisation, not a plural
    'vascular': 'vasculum',
    'cells': 'cell',
    'plants': 'plant',
    'siliques': 'silique',
    'organs': 'organ',
    'inflorescences': 'inflorescence',
    'rosettes': 'rosette',
    'protoplasts': 'protoplast',
}


def get_singular_form(w):
    ''' Return the canonical singular form of the word w.

    w is looked up verbatim (no case folding or stripping) in a fixed table
    of plural and adjectival forms.  If w is not in the table, it is
    returned unchanged.
    '''
    return _SINGULAR_FORMS.get(w, w)
@@ -349,6 +349,10 @@ def get_singular_form_for_several_words(s):
return ' '.join(result)
# Translation table mapping every ASCII punctuation character to a single
# space.  Built once at import time rather than on each call.
_PUNCTUATION_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))


def replace_punctuation_with_space(s):
    ''' Return a copy of s with each character in string.punctuation replaced by one space.

    The replacement is one-for-one, so the result has the same length as s.
    Runs of punctuation become runs of spaces, and existing whitespace is
    left untouched; callers that need single-spaced text must collapse it
    themselves (e.g. with split()).
    '''
    return s.translate(_PUNCTUATION_TO_SPACE)
+
+
def get_tissue(run_id, d_run, experiment_id, d_experiment, sample_id, d_sample, study_id, d_study):
''' Extract tissue name from s. s may contain several tissue names, return them ordered by frequency. '''
@@ -380,9 +384,7 @@ def get_tissue(run_id, d_run, experiment_id, d_experiment, sample_id, d_sample,
# build a count dictionary, where key is a word
d = {}
s = s.lower()
- s = s.replace('_', ' ')
- s = s.replace('-', ' ')
- s = s.translate(str.maketrans('', '', string.punctuation))
+ s = replace_punctuation_with_space(s)
wlst = s.split()
for w in wlst:
if w in lst: