diff options
-rw-r--r-- | Code/parse_ena_xml.py | 11 |
1 files changed, 10 insertions, 1 deletions
def clean_tissue_info(tissue_type: str) -> str:
    """Return a normalized tissue label for a raw tissue description.

    Args:
        tissue_type: Raw tissue text (may carry surrounding whitespace,
            mixed case and punctuation).

    Returns:
        ``''`` if the text contains 'not provided' (any letter case);
        ``'seedlings'`` if it contains the misspelling 'seedings' (any
        letter case); otherwise the text lower-cased, with every character
        in ``string.punctuation`` removed and surrounding whitespace
        trimmed.
    """
    lowered = tissue_type.lower()
    # Match the sentinels on the lower-cased text so that variants such as
    # 'Not Provided' or 'Seedings' are recognised as well.
    if 'not provided' in lowered:
        return ''
    if 'seedings' in lowered:  # misspelling of 'seedlings' in the input
        return 'seedlings'
    # Strip AFTER deleting punctuation: removing a trailing '.' or ')' can
    # expose whitespace that a strip performed beforehand would leave behind.
    return lowered.translate(str.maketrans('', '', string.punctuation)).strip()