diff options
author | Hui Lan <lanhui@zjnu.edu.cn> | 2019-12-04 19:03:19 +0800 |
---|---|---|
committer | Hui Lan <lanhui@zjnu.edu.cn> | 2019-12-04 19:03:19 +0800 |
commit | 97fdefab064f63642fa3ece05b807d29b459df31 (patch) | |
tree | a058530023224f3e35b1783996f3530c80c04bc5 /Code/parse_ena_xml_test.py |
brain: add python and R code to local repository.
Diffstat (limited to 'Code/parse_ena_xml_test.py')
-rw-r--r-- | Code/parse_ena_xml_test.py | 307 |
1 files changed, 307 insertions, 0 deletions
diff --git a/Code/parse_ena_xml_test.py b/Code/parse_ena_xml_test.py new file mode 100644 index 0000000..c12c580 --- /dev/null +++ b/Code/parse_ena_xml_test.py @@ -0,0 +1,307 @@ +# Usage: python parse_ena_xml.py > rnaseq_info_database.txt +# +# Search in this script for 'd_run', 'd_sample', 'd_experiment' and +# 'd_study', and set their input files. The input files are generated +# by download_ena_metadata.py (except for d_sample). It also +# generates a json file called info_database.json, for displaying +# experimental information in the scatterplot. If the input files are +# for RNA-seq data, rename info_database.json to +# rnaseq_info_database.json and move it to Data/information. Also move +# rnaseq_info_database.txt to Data/information. They are used by +# html_network.py. +# +# Purpose: Get description for RNA-seq data, one for each SRA Run ID. +# Make rnaseq_info_database.txt and rnaseq_info_database.json. Each +# line in rnaseq_info_database.txt contains information for a run id. +# +# NOTE: you might encounter UnicideEncodeError when running the +# program. To avoid that, first type this command: +# export PYTHONIOENCODING=UTF-8. +# +# 22 Feb 2017, slcu, hui +# 12 Apr 2017, slcu, hui +# 20 Apr 2017, slcu, hui +# 30 May 2017, slcu, hui +# 01 Jun 2017, slcu, hui [added a column sample_id] +# 19 Jun 2017, slcu, hui [added SraRunTable_Ath_Tax3702.txt in d_run2. Search d_run2 for how to get SraRunTable_Ath_Tax3702.txt.] + +import os, json, re, operator +import xml.etree.ElementTree +import sys + +MAX_DESCRIPTION_LENGTH = 600 # max number to characters to keep in json file + +def parse_SraRunTable(fname): + d = {} + f = open(fname) + lines = f.readlines() + f.close() + for line in lines: + line = line.strip() + if not line.startswith('#') and not line.startswith('Assay_Type_s') and line.lower().startswith('rna-seq'): + lst = line.split('\t') + acc = lst[17] + if not acc in d: + d[acc] = {} + d[acc]['experiment_id'] = lst[6] if lst[6] != '' else '.' + d[acc]['sample_id'] = (lst[4] + '...' + lst[18] + ' ' + lst[20]) if lst[4] != '' else '.' + d[acc]['study_id'] = lst[19] if lst[19] != '' else '.' + d[acc]['study_id_PRJ'] = lst[3] if lst[3] != '' else '.' + d[acc]['alias'] = lst[11] if lst[11] != '' else '.' + d[acc]['title'] = lst[20] if lst[20] != '' else '.' + return d + +def parse_run(fname): + d = {} + + root = xml.etree.ElementTree.parse(fname).getroot() + + for c in root.findall('RUN'): + acc = c.get('accession') + d[acc] = {} + + alias = c.get('alias') + d[acc]['alias'] = alias + + experiment = c.find('EXPERIMENT_REF').get('accession') + d[acc]['experiment_id'] = experiment + + title = c.find('TITLE').text + d[acc]['title'] = title + + d[acc]['study_id'] = '.' + for i in c.findall('./RUN_LINKS/RUN_LINK/XREF_LINK/ID'): + s = i.text + #print(s) + if 'RP' in s: # run project + d[acc]['study_id'] = s + break + d[acc]['sample_id'] = '.' + for i in c.findall('./RUN_LINKS/RUN_LINK/XREF_LINK/ID'): + s = i.text + if 'RS' in s: # run project + d[acc]['sample_id'] = s + break + + return d + + +def parse_study(fname): + d = {} + root = xml.etree.ElementTree.parse(fname).getroot() + + + for c in root.findall('PROJECT'): + d2 = {} + acc = c.find('./IDENTIFIERS/SECONDARY_ID') + if acc != None: + d2['secondary_id'] = acc.text + else: + d2['secondary_id'] = '.' + d2['primary_id'] = c.get('accession') + + desc = c.find('DESCRIPTION') + d2['description'] = 'None' + if desc != None: + d2['description'] = desc.text + + title = c.find('TITLE') + d2['title'] = 'None' + if title != None: + d2['title'] = title.text + + run_id = '' + for i in c.findall('./PROJECT_LINKS/PROJECT_LINK/XREF_LINK/ID'): + s = i.text + if 'RR' in s: + run_id = s; + break + lst = run_id.split(',') + for x in lst: + lst2 = x.split('-') + if len(lst2) == 1 and lst2[0] != '': + k = lst2[0] + d[k] = d2 # k is run id, such as SRR, ERR or DRR + elif len(lst2) == 2: + ss = lst2[0] + ee = lst2[1] + first_three_letters = ss[0:3] + sz = len(ss) - 3 + ss_t = int(ss[3:]) + ee_t = int(ee[3:]) + for j in range(ss_t, ee_t+1, 1): + k = first_three_letters + str(j).zfill(sz) + d[k] = d2 + return d + + +def parse_sample(fname): + d = {} + root = xml.etree.ElementTree.parse(fname).getroot() + + + for c in root.findall('SAMPLE'): + d2 = {} + acc = c.find('./IDENTIFIERS/EXTERNAL_ID') + if acc != None: + d2['external_id'] = acc.text + else: + d2['external_id'] = '.' + d2['primary_id'] = c.get('accession') + + desc = c.find('DESCRIPTION') + d2['description'] = 'None' + if desc != None and desc.text != None: + d2['description'] = desc.text + + title = c.find('TITLE') + d2['title'] = 'None' + if title != None and title.text != None: + d2['title'] = title.text + + tissue_type = '' + for i in c.findall('./SAMPLE_ATTRIBUTES/SAMPLE_ATTRIBUTE/VALUE'): + if i != None and i.text != None: + tissue_type += i.text + ' ' + d2['tissue'] = tissue_type.strip() + + run_id = '' + for i in c.findall('./SAMPLE_LINKS/SAMPLE_LINK/XREF_LINK/ID'): + s = i.text + if 'RR' in s: + run_id = s; + break + lst = run_id.split(',') + for x in lst: + lst2 = x.split('-') # e.g., SRR520490-SRR520491 + if len(lst2) == 1 and lst2[0] != '': + k = lst2[0] + d[k] = d2 # k is run id, such as SRR, ERR or DRR + elif len(lst2) == 2: + ss = lst2[0] + ee = lst2[1] + first_three_letters = ss[0:3] + sz = len(ss) - 3 + ss_t = int(ss[3:]) + ee_t = int(ee[3:]) + for j in range(ss_t, ee_t+1, 1): + k = first_three_letters + str(j).zfill(sz) + d[k] = d2 + return d + + +def parse_experiment(fname): + d = {} + + root = xml.etree.ElementTree.parse(fname).getroot() + + for c in root.findall('EXPERIMENT'): + d2 = {} + d2['primary_id'] = c.get('accession') + + title = c.find('TITLE') + d2['title'] = 'None' + if title != None and title.text != None: + d2['title'] = title.text + + desc = c.find('./DESIGN/DESIGN_DESCRIPTION') + d2['description'] = 'None' + if desc != None and desc.text != None: + d2['description'] = desc.text + + run_id = '' + for i in c.findall('./EXPERIMENT_LINKS/EXPERIMENT_LINK/XREF_LINK/ID'): + s = i.text + if 'RR' in s: + run_id = s; + break + lst = run_id.split(',') + for x in lst: + lst2 = x.split('-') # e.g., SRR520490-SRR520491 + if len(lst2) == 1 and lst2[0] != '': + k = lst2[0] + d[k] = d2 # k is run id, such as SRR, ERR or DRR + elif len(lst2) == 2: + ss = lst2[0] + ee = lst2[1] + first_three_letters = ss[0:3] + sz = len(ss) - 3 + ss_t = int(ss[3:]) + ee_t = int(ee[3:]) + for j in range(ss_t, ee_t+1, 1): + k = first_three_letters + str(j).zfill(sz) + d[k] = d2 + return d + + +def get_singular_form(w): + d = {'seedlings':'seedling', 'roots':'root', 'leaves':'leaf', 'flowers':'flower', 'floral':'flower', 'shoots':'shoot', 'apices':'apex', 'stems':'stem', 'seeds':'seed', 'petals':'petals', 'sepals':'sepal', 'embryos':'embryo', 'embryonic':'embryo', 'cotyledons':'cotyledon', 'hairs':'hair', 'meristems':'meristem', 'epidermal':'epidermis', 'apical':'apex', 'buds':'bud', 'vacuoles':'vacuole', 'vacuolar':'vacuole', 'tips':'tip', 'pollens':'pollen', 'hypocotyls':'hypocotyl', 'tubes':'tube', 'stomatal':'stomata', 'ovule':'ovules', 'pistils':'pistil', 'anthers':'anther', 'carpels':'carpel', 'pedicle':'pedicel', 'vascular':'vasculum'} + if w in d: + return d[w] + return w + +def get_tissue(s): + ''' Extract tissue name from s. s may contain several tissue names, return them ordered by frequency. ''' + + + lst = ['seedling', 'seedlings', 'root', 'roots', 'leaves', 'leaf', 'flower', 'flowers', 'floral', 'shoot', 'shoots', 'apex', 'apices', 'stamen', 'stem', 'stems', 'seed', 'seeds', 'petal', 'petals', 'sepal', 'sepals', 'embryo', 'embryos', 'embryonic', 'cotyledon', 'cotyledons', 'xylem', 'hair', 'hairs', 'phloem', 'pericycle', 'primordia', 'columella', 'cortex', 'meristem', 'meristems', 'cambium', 'epidermis', 'epidermal', 'phloem', 'mesophyll', 'apical', 'lateral', 'intercalary', 'parenchyma', 'collenchyma', 'sclerenchyma', 'bud', 'buds', 'endosperm', 'colletotrichum', 'stele', 'vacuoles', 'vacuole', 'vacuolar', 'tip', 'tips', 'pollen', 'hypocotyl', 'hypocotyls', 'tube', 'tubes', 'basal', 'stomatal', 'stomata', 'surface', 'progeny', 'ovules', 'carpel', 'carpels', 'gynoecium', 'pistil', 'pistils', 'anthers', 'anther', 'endodermis', 'dicotyledonous', 'hyphae', 'adabaxial', 'axial', 'cauline', 'rosette', 'pedicle', 'pedicel', 'inflorescence', 'petiole', 'lamina', 'vascular', 'bundle', 'sheath'] # possible tissue names, lower case. refer to /home/hui/network/test/rnaseq.word.count.txt for distinct words in rna seq. rnaseq.word.count.txt is generated by /home/hui/network/test/count_word.py + + # build a count dictionary, where key is a word + d = {} + s = s.lower() + wlst = re.sub("[^\w]", " ", s).split() # a list of words in s. http://stackoverflow.com/questions/6181763/converting-a-string-to-a-list-of-words + for w in wlst: + if w in lst: + w2 = get_singular_form(w) + if not w2 in d: + d[w2] = 1 + else: + d[w2] += 1 + if len(d) == 0: + return 'unknown' + + tlst = sorted(d.items(), key=operator.itemgetter(1), reverse=True) + result = '' + for t in tlst: + result += '%s(%d);' % (t[0], t[1]) + return result.rstrip(';') + + +## main + +# ENA xml meta files do not differentiate between different types of Seq, but are organised by RUN, STUDY, EXPERIMENT. So each +# of the following function is for each type of xml file. +d_run = parse_run('ena_rnaseq_read_run.xml') # RUN + +cmd = 'export PYTHONIOENCODING=UTF-8' # since xml files contains non-ascii characters, use this command to avoid encoding error during redirection +os.system(cmd) + +d_run_keys = d_run.keys() +d_run_keys = list(set(d_run_keys)) +for k in sorted(d_run_keys): + lst = [] + lst.append(k) + if k in d_run and 'illumina hiseq' in d_run[k]['title'].lower() and 'rna-seq' in d_run[k]['title'].lower(): + lst.append( d_run[k]['experiment_id']) + lst.append( d_run[k]['study_id'] ) + lst.append( d_run[k]['title'] ) + lst.append( d_run[k]['alias'] ) + print('\t'.join(lst)) + +# make a json file as well. this file is used to display rna-seq information in scatterplots. +json_dict = {} +for k in sorted(d_run_keys): + if k in d_run and 'illumina hiseq' in d_run[k]['title'].lower() and 'rna-seq' in d_run[k]['title'].lower(): + s = 'Title: ' + d_run[k]['title'] + '. Alias: ' + d_run[k]['alias'] + '. More info:' + s = s.strip() + d = {} + d['tissue'] = get_tissue(s) + d['detail'] = s[0:min(MAX_DESCRIPTION_LENGTH, len(s))] + ' ...' + + json_dict[k] = d + +fname = 'rnaseq_info_database.json' +with open(fname, 'w') as f: + json.dump(json_dict, f, indent=4) + +#sys.stderr.write('Check %s. Use this file to display RNA-seq information in the scatterplots. Copy it to Data/information and rename it to rnaseq_info_database.json.\n' % (fname)) |