summary refs log tree commit diff
path: root/Code/parse_ena_xml.py
diff options
context:
space:
mode:
Diffstat (limited to 'Code/parse_ena_xml.py')
-rw-r--r-- Code/parse_ena_xml.py 11
1 files changed, 10 insertions, 1 deletions
diff --git a/Code/parse_ena_xml.py b/Code/parse_ena_xml.py
index 6b78371..eea7029 100644
--- a/Code/parse_ena_xml.py
+++ b/Code/parse_ena_xml.py
@@ -211,7 +211,7 @@ def parse_sample(fname):
if 'tissue' in tag.text or 'organism part' in tag.text:
#print(value.text)
tissue_type += value.text + ' '
- d2['tissue'] = tissue_type.strip().lower().translate(str.maketrans('', '', string.punctuation)) # remove space, lower letters, and remove punctuations
+ d2['tissue'] = clean_tissue_info(tissue_type) # remove space, lower letters, and remove punctuations
d[primary_id] = d2
@@ -318,6 +318,15 @@ def parse_experiment(fname):
return d
+
def clean_tissue_info(tissue_type):
    """Normalize a free-text tissue description.

    The text is lower-cased, ASCII punctuation is removed, and surrounding
    whitespace is stripped.  Two special cases are checked first, both
    case-insensitively:

    * text containing 'not provided' yields an empty string;
    * text containing the misspelling 'seedings' yields 'seedlings'
      (any other words in the text are discarded).

    Args:
        tissue_type: Raw tissue text (str).

    Returns:
        The normalized tissue string; may be empty.
    """
    # Lower-case once up front so the special-case checks also match
    # variants such as 'Not provided' or 'Seedings'.
    lowered = tissue_type.lower()
    if 'not provided' in lowered:
        return ''
    if 'seedings' in lowered:  # a typo I guess
        return 'seedlings'
    # Strip *after* removing punctuation; otherwise input like 'leaf .'
    # would keep the space that preceded the removed character.
    return lowered.translate(str.maketrans('', '', string.punctuation)).strip()
+
+
def get_singular_form(w):
d = {'seedlings':'seedling', 'roots':'root', 'leaves':'leaf', 'flowers':'flower', 'floral':'flower', 'shoots':'shoot', 'apices':'apex', 'stems':'stem', 'seeds':'seed', 'petals':'petals', 'sepals':'sepal', 'embryos':'embryo', 'embryonic':'embryo', 'cotyledons':'cotyledon', 'hairs':'hair', 'meristems':'meristem', 'epidermal':'epidermis', 'apical':'apex', 'buds':'bud', 'vacuoles':'vacuole', 'vacuolar':'vacuole', 'tips':'tip', 'pollens':'pollen', 'hypocotyls':'hypocotyl', 'tubes':'tube', 'stomatal':'stomata', 'ovule':'ovules', 'pistils':'pistil', 'anthers':'anther', 'carpels':'carpel', 'pedicle':'pedicel', 'vascular':'vasculum', 'cells':'cell', 'plants':'plant', 'siliques':'silique', 'organs':'organ', 'inflorescences':'inflorescence'}
if w in d: