summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Lan Hui <lanhui@zjnu.edu.cn>, 2025-05-20 16:07:56 +0800
committer: Lan Hui <lanhui@zjnu.edu.cn>, 2025-05-20 16:07:56 +0800
commit: d72bea8b80cdb8a05b0fc28cb0d1f6421a9974ed (patch)
tree: 5f8e7705da653a7b49588f5cc266c2d392eb0d35
parent: 367285b4cf7edc639530e3fe17bbb390592b90a1 (diff)
Extract a function to handle tissue type info
-rw-r--r-- Code/parse_ena_xml.py 11
1 files changed, 10 insertions, 1 deletions
diff --git a/Code/parse_ena_xml.py b/Code/parse_ena_xml.py
index 6b78371..eea7029 100644
--- a/Code/parse_ena_xml.py
+++ b/Code/parse_ena_xml.py
@@ -211,7 +211,7 @@ def parse_sample(fname):
if 'tissue' in tag.text or 'organism part' in tag.text:
#print(value.text)
tissue_type += value.text + ' '
- d2['tissue'] = tissue_type.strip().lower().translate(str.maketrans('', '', string.punctuation)) # remove space, lower letters, and remove punctuations
+ d2['tissue'] = clean_tissue_info(tissue_type) # remove space, lower letters, and remove punctuations
d[primary_id] = d2
@@ -318,6 +318,15 @@ def parse_experiment(fname):
return d
+
+def clean_tissue_info(tissue_type):
+ if 'not provided' in tissue_type:
+ return ''
+ if 'seedings' in tissue_type: # a typo I guess
+ return 'seedlings'
+ return tissue_type.strip().lower().translate(str.maketrans('', '', string.punctuation))
+
+
def get_singular_form(w):
d = {'seedlings':'seedling', 'roots':'root', 'leaves':'leaf', 'flowers':'flower', 'floral':'flower', 'shoots':'shoot', 'apices':'apex', 'stems':'stem', 'seeds':'seed', 'petals':'petals', 'sepals':'sepal', 'embryos':'embryo', 'embryonic':'embryo', 'cotyledons':'cotyledon', 'hairs':'hair', 'meristems':'meristem', 'epidermal':'epidermis', 'apical':'apex', 'buds':'bud', 'vacuoles':'vacuole', 'vacuolar':'vacuole', 'tips':'tip', 'pollens':'pollen', 'hypocotyls':'hypocotyl', 'tubes':'tube', 'stomatal':'stomata', 'ovule':'ovules', 'pistils':'pistil', 'anthers':'anther', 'carpels':'carpel', 'pedicle':'pedicel', 'vascular':'vasculum', 'cells':'cell', 'plants':'plant', 'siliques':'silique', 'organs':'organ', 'inflorescences':'inflorescence'}
if w in d: