# Usage: python rnaseq_or_chipseq.py all.short.reads.txt > table.txt # # Purpose: check if a RUN id in all.short.reads.txt is RNA-seq or ChIP-seq or other. Adpated from parse_ena_xml.py. # # Note: to avoid encoding error while writing to output.txt, first type this command: export PYTHONIOENCODING=UTF-8. # # 13 Apr 2017, slcu, hui import os, json, re, operator, sys import xml.etree.ElementTree MAX_DESCRIPTION_LENGTH = 600 # max number to characters to keep in json file def parse_run(fname): d = {} root = xml.etree.ElementTree.parse(fname).getroot() for c in root.findall('RUN'): acc = c.get('accession') d[acc] = {} alias = c.get('alias') d[acc]['alias'] = alias experiment = c.find('EXPERIMENT_REF').get('accession') d[acc]['experiment_id'] = experiment title = c.find('TITLE').text d[acc]['title'] = title d[acc]['study_id'] = '.' for i in c.findall('./RUN_LINKS/RUN_LINK/XREF_LINK/ID'): s = i.text #print(s) if 'RP' in s: # run project d[acc]['study_id'] = s break d[acc]['sample_id'] = '.' for i in c.findall('./RUN_LINKS/RUN_LINK/XREF_LINK/ID'): s = i.text if 'RS' in s: # run project d[acc]['sample_id'] = s break return d def get_key(s): if '_' in s: return s[:s.find('_')] else: return s[:s.find('.')] def make_downloaded_dict(fname): f = open(fname) d = {} for line in f: line = line.strip() if line != '' and not line.startswith('#'): if line.startswith('-'): fn = line.split(' ')[-1] else: fn = os.path.basename(line) k = get_key(fn) if not k in d: d[k] = [fn] else: d[k].append(fn) d[k] = sorted(list(set(d[k]))) return d ## main cmd = 'export PYTHONIOENCODING=UTF-8' # since xml files contains non-ascii characters, use this command to avoid encoding error during redirection os.system(cmd) d = make_downloaded_dict(sys.argv[1]) # ENA xml meta files do not differentiate between different types of Seq, but are organised by RUN, STUDY, EXPERIMENT. So each # of the following function is for each type of xml file. d_rnaseq_run = parse_run('../Data/information/ena_rnaseq_read_run.xml') # RUN d_chipseq_run = parse_run('../Data/information/ena_ChIP-Seq_read_run.xml') # RUN for k in sorted(d.keys()): s = k + '\t' + ' '.join(d[k]) if k in d_rnaseq_run: s += '\t' + 'RNA-seq' elif k in d_chipseq_run: s += '\t' + 'ChIP-seq' else: s += '\t' + '.' print(s)