summary | refs | log | tree | commit | diff
path: root/Code/parse_ena_xml.py
diff options
context:
space:
mode:
author Lan Hui <lanhui@zjnu.edu.cn> 2025-04-16 14:39:34 +0800
committer Lan Hui <lanhui@zjnu.edu.cn> 2025-04-16 14:39:34 +0800
commit fb2b2e547139739e183a797d4f092974ed82ae00 (patch)
tree 48fa17583451520c84b5fab7953287531ad0698b /Code/parse_ena_xml.py
parent e89be92379976b0d977e66068a14a3c69d229431 (diff)
Retrieve more tissue information from read_experiment
Diffstat (limited to 'Code/parse_ena_xml.py')
-rw-r--r-- Code/parse_ena_xml.py 39
1 file changed, 26 insertions, 13 deletions
diff --git a/Code/parse_ena_xml.py b/Code/parse_ena_xml.py
index 0a08a7e..0055ec5 100644
--- a/Code/parse_ena_xml.py
+++ b/Code/parse_ena_xml.py
@@ -120,7 +120,7 @@ def parse_sample(fname):
if title != None and title.text != None:
d2['title'] = title.text
- tissue_type = 'Unknown'
+ tissue_type = ''
for i in c.findall('./SAMPLE_ATTRIBUTES/SAMPLE_ATTRIBUTE'):
#print(i)
tag = i.find('./TAG')
@@ -170,6 +170,18 @@ def parse_experiment(fname):
if source != None and source.text != None:
d2['library_source'] = source.text
+ protocol = c.find('./DESIGN/LIBRARY_DESCRIPTOR/LIBRARY_CONSTRUCTION_PROTOCOL')
+ d2['protocol'] = 'None!'
+ if protocol != None and protocol.text != None:
+ d2['protocol'] = protocol.text
+
+ attribute = ''
+ for i in c.findall('./EXPERIMENT_ATTRIBUTES/EXPERIMENT_ATTRIBUTE'):
+ tag = i.find('./TAG')
+ value = i.find('./VALUE')
+ attribute += value.text + ' '
+ d2['attribute'] = attribute
+
d[primary_id] = d2
return d
@@ -182,24 +194,25 @@ def get_singular_form(w):
return w
-def get_tissue(sample_id, d_sample):
+def get_tissue(sample_id, d_sample, experiment_id, d_experiment):
''' Extract tissue name from s. s may contain several tissue names, return them ordered by frequency. '''
- tissue = 'Unknown'
- result0 = ''
- #print(sample_id)
- #print(list(d.keys())[0:10])
+ tissue = ''
if sample_id in d_sample:
tissue = d_sample[sample_id]['tissue']
- if tissue != 'Unknown':
- result0 = tissue
+ if tissue:
+ return tissue
s = ''
if sample_id in d_sample:
- s += d_sample[sample_id]['title']
- s += d_sample[sample_id]['description']
+ s += ' ' + d_sample[sample_id]['title']
+ s += ' ' + d_sample[sample_id]['description']
+
+ if experiment_id in d_experiment:
+ s += ' ' + d_experiment[experiment_id]['protocol']
+ s += ' ' + d_experiment[experiment_id]['attribute']
- lst = ['seedling', 'seedlings', 'root', 'roots', 'leaves', 'leaf', 'flower', 'flowers', 'floral', 'shoot', 'shoots', 'apex', 'apices', 'stamen', 'stem', 'stems', 'seed', 'seeds', 'petal', 'petals', 'sepal', 'sepals', 'embryo', 'embryos', 'embryonic', 'cotyledon', 'cotyledons', 'xylem', 'hair', 'hairs', 'phloem', 'pericycle', 'primordia', 'columella', 'cortex', 'meristem', 'meristems', 'cambium', 'epidermis', 'epidermal', 'phloem', 'mesophyll', 'apical', 'lateral', 'intercalary', 'parenchyma', 'collenchyma', 'sclerenchyma', 'bud', 'buds', 'endosperm', 'colletotrichum', 'stele', 'vacuoles', 'vacuole', 'vacuolar', 'tip', 'tips', 'pollen', 'hypocotyl', 'hypocotyls', 'tube', 'tubes', 'basal', 'stomatal', 'stomata', 'surface', 'progeny', 'ovules', 'carpel', 'carpels', 'gynoecium', 'pistil', 'pistils', 'anthers', 'anther', 'endodermis', 'dicotyledonous', 'hyphae', 'adabaxial', 'axial', 'cauline', 'rosette', 'pedicle', 'pedicel', 'inflorescence', 'petiole', 'lamina', 'vascular', 'bundle', 'sheath'] # possible tissue names, lower case. refer to /home/hui/network/test/rnaseq.word.count.txt for distinct words in rna seq. rnaseq.word.count.txt is generated by /home/hui/network/test/count_word.py
+ lst = ['seedling', 'seedlings', 'root', 'roots', 'leaves', 'leaf', 'flower', 'flowers', 'floral', 'shoot', 'shoots', 'apex', 'apices', 'stamen', 'stem', 'stems', 'seed', 'seeds', 'petal', 'petals', 'sepal', 'sepals', 'embryo', 'embryos', 'embryonic', 'cotyledon', 'cotyledons', 'xylem', 'hair', 'hairs', 'phloem', 'pericycle', 'primordia', 'columella', 'cortex', 'meristem', 'meristems', 'cambium', 'epidermis', 'epidermal', 'phloem', 'mesophyll', 'apical', 'lateral', 'intercalary', 'parenchyma', 'collenchyma', 'sclerenchyma', 'bud', 'buds', 'endosperm', 'colletotrichum', 'stele', 'vacuoles', 'vacuole', 'vacuolar', 'tip', 'tips', 'pollen', 'hypocotyl', 'hypocotyls', 'tube', 'tubes', 'basal', 'stomatal', 'stomata', 'surface', 'progeny', 'ovules', 'carpel', 'carpels', 'gynoecium', 'pistil', 'pistils', 'anthers', 'anther', 'endodermis', 'dicotyledonous', 'hyphae', 'adabaxial', 'axial', 'cauline', 'rosette', 'pedicle', 'pedicel', 'inflorescence', 'petiole', 'lamina', 'vascular', 'bundle', 'sheath', 'microspore'] # possible tissue names, lower case. refer to /home/hui/network/test/rnaseq.word.count.txt for distinct words in rna seq. rnaseq.word.count.txt is generated by /home/hui/network/test/count_word.py
# build a count dictionary, where key is a word
d = {}
@@ -222,7 +235,7 @@ def get_tissue(sample_id, d_sample):
tlst = sorted(d.items(), key=operator.itemgetter(1), reverse=True)
for t in tlst:
result += '%s(%d);' % (t[0], t[1])
- return result0 + ';' + result.rstrip(';')
+ return result.rstrip(';')
def get_tissue2(sample_id, d):
@@ -291,7 +304,7 @@ if __name__ == '__main__':
d['tissue'] = d['library_strategy'] = d['library_source'] = d['sample_id'] = '.'
if k2 in d_experiment:
d['sample_id'] = d_experiment[k2]['sample_id']
- d['tissue'] = get_tissue(d_experiment[k2]['sample_id'], d_sample)
+ d['tissue'] = get_tissue(d_experiment[k2]['sample_id'], d_sample, k2, d_experiment)
d['library_strategy'] = d_experiment[k2]['library_strategy']
d['library_source'] = d_experiment[k2]['library_source']
d['detail'] = 'TBA'