diff options
Diffstat (limited to 'Code/parse_ena_xml.py')
| -rw-r--r-- | Code/parse_ena_xml.py | 61 | 
1 files changed, 43 insertions, 18 deletions
| diff --git a/Code/parse_ena_xml.py b/Code/parse_ena_xml.py index 1bc2862..0a08a7e 100644 --- a/Code/parse_ena_xml.py +++ b/Code/parse_ena_xml.py @@ -28,6 +28,7 @@  import os, json, re, operator  import xml.etree.ElementTree  import sys +import string  from configure import ENA_RECORDS_READ_RUN, ENA_RECORDS_READ_EXPERIMENT, ENA_RECORDS_SAMPLE, ENA_RECORDS_STUDY  MAX_DESCRIPTION_LENGTH = 6000 # max number to characters to keep in json file @@ -119,11 +120,15 @@ def parse_sample(fname):          if title != None and title.text != None:              d2['title'] = title.text -        tissue_type = '' -        for i in c.findall('./SAMPLE_ATTRIBUTES/SAMPLE_ATTRIBUTE/VALUE'): -            if i != None and i.text != None: -                tissue_type += i.text + ' ' -        d2['tissue'] = tissue_type.strip() +        tissue_type = 'Unknown' +        for i in c.findall('./SAMPLE_ATTRIBUTES/SAMPLE_ATTRIBUTE'): +            #print(i) +            tag = i.find('./TAG') +            value = i.find('./VALUE') +            if 'tissue' in tag.text or 'organism part' in tag.text: +                #print(value.text) +                tissue_type = value.text + ';' +        d2['tissue'] = tissue_type.rstrip(';')          d[primary_id] = d2 @@ -150,7 +155,7 @@ def parse_experiment(fname):          if desc != None and desc.text != None:              d2['description'] = desc.text -        sample = c.find('./DESIGN/SAMPLE_DESCRIPTOR/IDENTIFIERS/EXTERNAL_ID/') +        sample = c.find('./DESIGN/SAMPLE_DESCRIPTOR/IDENTIFIERS/EXTERNAL_ID')          d2['sample_id'] = 'None'          if sample != None and sample.text != None:              d2['sample_id'] = sample.text @@ -176,16 +181,34 @@ def get_singular_form(w):          return d[w]      return w -def get_tissue(s): + +def get_tissue(sample_id, d_sample):      ''' Extract tissue name from s.  s may contain several tissue names, return them ordered by frequency.  ''' +    tissue = 'Unknown' +    result0 = '' +    #print(sample_id) +    #print(list(d.keys())[0:10]) +    if sample_id in d_sample: +        tissue = d_sample[sample_id]['tissue'] +    if tissue != 'Unknown': +        result0 = tissue + +    s = '' +    if sample_id in d_sample: +        s += d_sample[sample_id]['title'] +        s += d_sample[sample_id]['description']      lst = ['seedling', 'seedlings', 'root', 'roots', 'leaves', 'leaf', 'flower', 'flowers', 'floral', 'shoot', 'shoots', 'apex', 'apices', 'stamen', 'stem', 'stems', 'seed', 'seeds', 'petal', 'petals', 'sepal', 'sepals', 'embryo', 'embryos', 'embryonic', 'cotyledon', 'cotyledons', 'xylem', 'hair', 'hairs', 'phloem', 'pericycle', 'primordia', 'columella', 'cortex', 'meristem', 'meristems', 'cambium', 'epidermis', 'epidermal', 'phloem', 'mesophyll', 'apical', 'lateral', 'intercalary', 'parenchyma', 'collenchyma', 'sclerenchyma', 'bud', 'buds', 'endosperm', 'colletotrichum', 'stele', 'vacuoles', 'vacuole', 'vacuolar', 'tip', 'tips', 'pollen', 'hypocotyl', 'hypocotyls', 'tube', 'tubes', 'basal', 'stomatal', 'stomata', 'surface', 'progeny', 'ovules', 'carpel', 'carpels', 'gynoecium', 'pistil', 'pistils', 'anthers', 'anther', 'endodermis', 'dicotyledonous', 'hyphae', 'adabaxial', 'axial', 'cauline', 'rosette', 'pedicle', 'pedicel', 'inflorescence', 'petiole', 'lamina', 'vascular', 'bundle', 'sheath'] # possible tissue names, lower case.  refer to /home/hui/network/test/rnaseq.word.count.txt for distinct words in rna seq. rnaseq.word.count.txt is generated by /home/hui/network/test/count_word.py      # build a count dictionary, where key is a word      d = {}      s = s.lower() -    wlst = re.sub("[^\w]", " ",  s).split() # a list of words in s. http://stackoverflow.com/questions/6181763/converting-a-string-to-a-list-of-words +    s = s.replace('_', ' ') +    s = s.replace('-', ' ') +    s = s.translate(str.maketrans('', '', string.punctuation)) +    wlst = s.split() +    #wlst = re.sub("[^\w]", " ",  s).split() # a list of words in s. http://stackoverflow.com/questions/6181763/converting-a-string-to-a-list-of-words      for w in wlst:          if w in lst:              w2 = get_singular_form(w) @@ -193,18 +216,19 @@ def get_tissue(s):                  d[w2] = 1              else:                  d[w2] += 1 -    if len(d) == 0: -        return 'unknown' -     -    tlst = sorted(d.items(), key=operator.itemgetter(1), reverse=True) +      result = '' -    for t in tlst: -        result += '%s(%d);' % (t[0], t[1]) -    return result.rstrip(';') +    if d: +        tlst = sorted(d.items(), key=operator.itemgetter(1), reverse=True) +        for t in tlst: +            result += '%s(%d);' % (t[0], t[1]) +    return result0 + ';' + result.rstrip(';')  def get_tissue2(sample_id, d): -    tissue = '.' +    tissue = '' +    #print(sample_id) +    #print(list(d.keys())[0:10])      if sample_id in d:          tissue = d[sample_id]['tissue']      return tissue @@ -264,9 +288,10 @@ if __name__ == '__main__':      for k in sorted(d_run_keys):          d = {}          k2 = d_run[k]['experiment_id'] -        d['tissue'] = d['library_strategy'] = d['library_source'] = '.' +        d['tissue'] = d['library_strategy'] = d['library_source'] = d['sample_id'] = '.'          if k2 in d_experiment: -            d['tissue'] = get_tissue2(d_experiment[k2]['sample_id'], d_sample) +            d['sample_id'] = d_experiment[k2]['sample_id'] +            d['tissue'] = get_tissue(d_experiment[k2]['sample_id'], d_sample)              d['library_strategy'] = d_experiment[k2]['library_strategy']              d['library_source'] = d_experiment[k2]['library_source']          d['detail'] = 'TBA' | 
