# Usage: python parse_ena_xml.py > rnaseq_info_database.txt
#
# Search in this script for 'd_run', 'd_sample', 'd_experiment' and
# 'd_study', and set their input files.  The input files are generated
# by download_ena_metadata.py (except for d_sample).  It also
# generates a json file called info_database.json, for displaying
# experimental information in the scatterplot. If the input files are
# for RNA-seq data, rename info_database.json to
# rnaseq_info_database.json and move it to Data/information. Also move
# rnaseq_info_database.txt to Data/information.  They are used by
# html_network.py.
# 
# Purpose: Get description for RNA-seq data, one for each SRA Run ID.
# Make rnaseq_info_database.txt and rnaseq_info_database.json. Each
# line in rnaseq_info_database.txt contains information for a run id.
#
# NOTE: you might encounter UnicideEncodeError when running the
# program.  To avoid that, first type this command:
# export PYTHONIOENCODING=UTF-8.
#
# 22 Feb 2017, slcu, hui
# 12 Apr 2017, slcu, hui
# 20 Apr 2017, slcu, hui
# 30 May 2017, slcu, hui
# 01 Jun 2017, slcu, hui  [added a column sample_id]
# 19 Jun 2017, slcu, hui  [added SraRunTable_Ath_Tax3702.txt in d_run2.  Search d_run2 for how to get SraRunTable_Ath_Tax3702.txt.]
# 10 Feb 2021, zjnu, hui  [download latest run info files: information/ena_3702_read_run.xml, ena_3702_study.xml, ena_3702_sample.xml, information/ena_3702_read_experiment.xml;do not use SraRunTable_Ath_Tax3702.txt and d_run2 anymore.]
import os, json, re, operator
import xml.etree.ElementTree
import sys
import string
from configure import ENA_RECORDS_READ_RUN, ENA_RECORDS_READ_EXPERIMENT, ENA_RECORDS_SAMPLE, ENA_RECORDS_STUDY
MAX_DESCRIPTION_LENGTH = 6000 # max number to characters to keep in json file


def parse_run(fname):
    d = {}
    
    root = xml.etree.ElementTree.parse(fname).getroot()
    
    for c in root.findall('RUN'):
        acc = c.get('accession')
        d[acc] = {}
        
        alias = c.get('alias')
        d[acc]['alias'] = alias
        
        experiment = c.find('EXPERIMENT_REF').get('accession')
        d[acc]['experiment_id'] = experiment
        
        title = c.find('TITLE').text
        d[acc]['title'] = title
        
        d[acc]['study_id'] = '.'
        for i in c.findall('./RUN_LINKS/RUN_LINK/XREF_LINK/ID'):
            s = i.text
            if 'RP' in s: # run project
                d[acc]['study_id'] = s
                break
        d[acc]['sample_id'] = '.'
        for i in c.findall('./RUN_LINKS/RUN_LINK/XREF_LINK/ID'):
            s = i.text
            if 'RS' in s: # run project
                d[acc]['sample_id'] = s
                break

    return d


def parse_study(fname):
    d = {}
    root = xml.etree.ElementTree.parse(fname).getroot()

    for c in root.findall('PROJECT'):
        primary_id = c.get('accession')

        d2 = {}
        acc = c.find('./IDENTIFIERS/SECONDARY_ID')
        if acc != None:
            d2['secondary_id'] = acc.text
        else:
            d2['secondary_id'] = '.'
        
        desc = c.find('DESCRIPTION')
        d2['description'] = 'None'
        if desc != None:
            d2['description'] = desc.text        

        title = c.find('TITLE')
        d2['title'] = 'None'
        if title != None:
            d2['title'] = title.text

        d[primary_id] = d2

    return d


def parse_sample(fname):
    d = {}
    root = xml.etree.ElementTree.parse(fname).getroot()

    for c in root.findall('SAMPLE'):
        primary_id = c.get('accession')

        d2 = {}
        acc = c.find('./IDENTIFIERS/EXTERNAL_ID')
        if acc != None:
            d2['external_id'] = acc.text
        else:
            d2['external_id'] = '.'

        desc = c.find('DESCRIPTION')
        d2['description'] = 'None'
        if desc != None and desc.text != None:
            d2['description'] = desc.text

        title = c.find('TITLE')
        d2['title'] = 'None'
        if title != None and title.text != None:
            d2['title'] = title.text

        tissue_type = ''
        for i in c.findall('./SAMPLE_ATTRIBUTES/SAMPLE_ATTRIBUTE'):
            #print(i)
            tag = i.find('./TAG')
            value = i.find('./VALUE')
            if 'tissue' in tag.text or 'organism part' in tag.text:
                #print(value.text)
                tissue_type = value.text + ';'
        d2['tissue'] = tissue_type.rstrip(';')

        d[primary_id] = d2

    return d


def parse_experiment(fname):
    d = {}
    
    root = xml.etree.ElementTree.parse(fname).getroot()
    
    for c in root.findall('EXPERIMENT'):
        primary_id = c.get('accession')

        d2 = {}

        study = c.find('./STUDY_REF/IDENTIFIERS/SECONDARY_ID')
        d2['study_id'] = 'None'
        if study != None and study.text != None:
            d2['study_id'] = study.text
        
        title = c.find('TITLE')
        d2['title'] = 'None'
        if title != None and title.text != None:
            d2['title'] = title.text

        desc = c.find('./DESIGN/DESIGN_DESCRIPTION')
        d2['description'] = 'None'
        if desc != None and desc.text != None:
            d2['description'] = desc.text

        sample = c.find('./DESIGN/SAMPLE_DESCRIPTOR/IDENTIFIERS/EXTERNAL_ID')
        d2['sample_id'] = 'None'
        if sample != None and sample.text != None:
            d2['sample_id'] = sample.text

        strategy = c.find('./DESIGN/LIBRARY_DESCRIPTOR/LIBRARY_STRATEGY')
        d2['library_strategy'] = 'None'  # we look for RNA-Seq
        if strategy != None and strategy.text != None:
            d2['library_strategy'] = strategy.text

        source = c.find('./DESIGN/LIBRARY_DESCRIPTOR/LIBRARY_SOURCE')
        d2['library_source'] = 'None!'
        if source != None and source.text != None:
            d2['library_source'] = source.text

        protocol = c.find('./DESIGN/LIBRARY_DESCRIPTOR/LIBRARY_CONSTRUCTION_PROTOCOL')
        d2['protocol'] = 'None!'
        if protocol != None and protocol.text != None:
            d2['protocol'] = protocol.text

        attribute = ''
        for i in c.findall('./EXPERIMENT_ATTRIBUTES/EXPERIMENT_ATTRIBUTE'):
            tag = i.find('./TAG')
            value = i.find('./VALUE')
            attribute += value.text + ' '
        d2['attribute'] = attribute

        d[primary_id] = d2

    return d


def get_singular_form(w):
    d = {'seedlings':'seedling', 'roots':'root', 'leaves':'leaf', 'flowers':'flower', 'floral':'flower', 'shoots':'shoot', 'apices':'apex', 'stems':'stem', 'seeds':'seed', 'petals':'petals', 'sepals':'sepal', 'embryos':'embryo', 'embryonic':'embryo', 'cotyledons':'cotyledon', 'hairs':'hair', 'meristems':'meristem', 'epidermal':'epidermis', 'apical':'apex', 'buds':'bud', 'vacuoles':'vacuole', 'vacuolar':'vacuole', 'tips':'tip', 'pollens':'pollen', 'hypocotyls':'hypocotyl', 'tubes':'tube', 'stomatal':'stomata', 'ovule':'ovules', 'pistils':'pistil', 'anthers':'anther', 'carpels':'carpel', 'pedicle':'pedicel', 'vascular':'vasculum'}
    if w in d:
        return d[w]
    return w


def get_tissue(run_id, d_run, experiment_id, d_experiment, sample_id, d_sample, study_id, d_study):
    ''' Extract tissue name from s.  s may contain several tissue names, return them ordered by frequency.  '''

    tissue = ''
    if sample_id in d_sample:
        tissue = d_sample[sample_id]['tissue']
    if tissue:
        return tissue

    s = ''
    if sample_id in d_sample:
        s += ' ' + d_sample[sample_id]['title']
        s += ' ' + d_sample[sample_id]['description']

    if experiment_id in d_experiment:
        s += ' ' + d_experiment[experiment_id]['title']
        s += ' ' + d_experiment[experiment_id]['protocol']
        s += ' ' + d_experiment[experiment_id]['attribute']

    if run_id in d_run:
        s += ' ' + d_run[run_id]['title']

    if study_id in d_study:
        s += ' ' + d_study[study_id]['title']
        s += ' ' + d_study[study_id]['description']

    lst = ['seedling', 'seedlings', 'root', 'roots', 'leaves', 'leaf', 'flower', 'flowers', 'floral', 'shoot', 'shoots', 'apex', 'apices', 'stamen', 'stem', 'stems', 'seed', 'seeds', 'petal', 'petals', 'sepal', 'sepals', 'embryo', 'embryos', 'embryonic', 'cotyledon', 'cotyledons', 'xylem', 'hair', 'hairs', 'phloem', 'pericycle', 'primordia', 'columella', 'cortex', 'meristem', 'meristems', 'cambium', 'epidermis', 'epidermal', 'phloem', 'mesophyll', 'apical', 'lateral', 'intercalary', 'parenchyma', 'collenchyma', 'sclerenchyma', 'bud', 'buds', 'endosperm', 'colletotrichum', 'stele', 'vacuoles', 'vacuole', 'vacuolar', 'tip', 'tips', 'pollen', 'hypocotyl', 'hypocotyls', 'tube', 'tubes', 'basal', 'stomatal', 'stomata', 'surface', 'progeny', 'ovules', 'carpel', 'carpels', 'gynoecium', 'pistil', 'pistils', 'anthers', 'anther', 'endodermis', 'dicotyledonous', 'hyphae', 'adabaxial', 'axial', 'cauline', 'rosette', 'pedicle', 'pedicel', 'inflorescence', 'petiole', 'lamina', 'vascular', 'bundle', 'sheath', 'microspore'] # possible tissue names, lower case.  refer to /home/hui/network/test/rnaseq.word.count.txt for distinct words in rna seq. rnaseq.word.count.txt is generated by /home/hui/network/test/count_word.py

    # build a count dictionary, where key is a word
    d = {}
    s = s.lower()
    s = s.replace('_', ' ')
    s = s.replace('-', ' ')
    s = s.translate(str.maketrans('', '', string.punctuation))
    wlst = s.split()
    #wlst = re.sub("[^\w]", " ",  s).split() # a list of words in s. http://stackoverflow.com/questions/6181763/converting-a-string-to-a-list-of-words
    for w in wlst:
        if w in lst:
            w2 = get_singular_form(w)
            if not w2 in d:
                d[w2] = 1
            else:
                d[w2] += 1

    result = ''
    if d:
        tlst = sorted(d.items(), key=operator.itemgetter(1), reverse=True)
        for t in tlst:
            result += '%s(%d);' % (t[0], t[1])
    return result.rstrip(';')


def get_tissue2(sample_id, d):
    tissue = ''
    #print(sample_id)
    #print(list(d.keys())[0:10])
    if sample_id in d:
        tissue = d[sample_id]['tissue']
    return tissue


## main
if __name__ == '__main__':
    
    # ENA xml meta files do not differentiate between different types of Seq, but are organised by RUN, STUDY, SAMPLE, EXPERIMENT.  So each
    # of the following function is call for each type of xml file.  The input files were downloaded from https://www.ebi.ac.uk/ena/browser/view/Taxon:3702
    d_run        = parse_run(ENA_RECORDS_READ_RUN)                   # RUN
    print(f'%% {ENA_RECORDS_READ_RUN}:  {len(d_run)} entries')
    d_experiment = parse_experiment(ENA_RECORDS_READ_EXPERIMENT)     # EXPERIMENT, including library strategy (RNA-Seq, WSG, etc) and library source (TRANSCRIPTIOMIC, GENOMIC, etc)
    print(f'%% {ENA_RECORDS_READ_EXPERIMENT}:  {len(d_experiment)} entries')
    d_sample     = parse_sample(ENA_RECORDS_SAMPLE)                  # SAMPLE
    print(f'%% {ENA_RECORDS_SAMPLE}:  {len(d_sample)} entries')
    d_study      = parse_study(ENA_RECORDS_STUDY)                    # STUDY
    print(f'%% {ENA_RECORDS_STUDY}:  {len(d_study)} entries')

    cmd = 'export PYTHONIOENCODING=UTF-8'  # since xml files contains non-ascii characters, use this command to avoid encoding error during redirection
    os.system(cmd)
    
    print('%s' % ('\t'.join(['run_id', 'sample_id', 'experiment_id', 'study_id', 'study_id_PRJ', 'title', 'alias', 'description', 'library_strategy', 'library_source']))) # description comes from three sources, STUDY, SAMPLE and EXPERIMENT

    d_run_keys = d_run.keys()
    d_run_keys = list(set(d_run_keys))
    
    for k in sorted(d_run_keys):
        lst = [k]
        sample_id = d_run[k]['sample_id']
        experiment_id = d_run[k]['experiment_id']
        study_id = d_run[k]['study_id']
        study_id_PRJ = '.'
        title = d_run[k]['title']
        alias = d_run[k]['alias']
        description = '.'
        library_strategy = '.'
        library_source = '.'
        if experiment_id in d_experiment:
            description = d_experiment[experiment_id]['description']
            library_strategy = d_experiment[experiment_id]['library_strategy']
            library_source = d_experiment[experiment_id]['library_source']
        lst.append(sample_id)
        lst.append(experiment_id)
        lst.append(study_id)
        lst.append(study_id_PRJ)
        lst.append(title)
        lst.append(alias)
        lst.append(description)
        lst.append(library_strategy)
        lst.append(library_source)
        print('%s' % ('\t'.join(lst)))

    # Make a json file as well. this file is used to display rna-seq information in scatterplots.
    json_dict = {}
    count_transcriptomic = 0
    count_tissue = 0
    for k in sorted(d_run_keys):
        d = {}
        k2 = d_run[k]['experiment_id']
        k3 = d_experiment[k2]['sample_id'] if k2 in d_experiment else 'SAM_UNKNOWN'
        k4 = d_experiment[k2]['study_id'] if k2 in d_experiment else 'PRJ_UNKNOWN'
        d['tissue'] = d['library_strategy'] = d['library_source'] = d['sample_id'] = ''
        if k2 in d_experiment:
            d['sample_id'] = d_experiment[k2]['sample_id']
            d['tissue'] = get_tissue(k, d_run, k2, d_experiment, k3, d_sample, k4, d_study)
            d['library_strategy'] = d_experiment[k2]['library_strategy']
            d['library_source'] = d_experiment[k2]['library_source']
        d['detail'] = 'TBA'
        json_dict[k] = d
        if d['library_source'] == 'TRANSCRIPTOMIC':
            count_transcriptomic += 1
            if d['tissue']:
                count_tissue += 1

    percent = 100*count_tissue/count_transcriptomic
    print(f'%% RNA-seq: {count_transcriptomic}, of which {count_tissue} having tissue info ({percent} percent)')

    fname = '../Data/information/rnaseq_info_database.json.temp'
    with open(fname, 'w') as f:
        json.dump(json_dict, f, indent=4)


    sys.exit()
    # Collect information for each run ID
    for k in sorted(d_run_keys):
        lst = []
        lst.append(k)    
        if k in d_run:
            if k in d_sample:
                if d_sample[k]['external_id'] != '.':
                    lst.append(d_sample[k]['external_id'] + '...' + d_sample[k]['tissue'])            
                else:
                    lst.append(d_sample[k]['primary_id'] + '...' + d_sample[k]['tissue'])
            else:
                lst.append('.')

            lst.append( d_run[k]['experiment_id'] )
            lst.append( d_run[k]['study_id'] )

            # for column study_id_PRJ
            if k in d_study:
                lst.append( d_study[k]['primary_id'] )
            else:
                lst.append( '.' )
            lst.append( d_run[k]['title'] )
            lst.append( d_run[k]['alias'] )
    
            s = '' # description string
    
            if k in d_study:
                s += ' <br><br>[Study title] '  + d_study[k]['title'] + ' <br><br>[Study description] ' + d_study[k]['description'] # <br> is used for breaking lines in html
            if k in d_sample:
                s += ' <br><br>[Sample title] ' + d_sample[k]['title'] + ' <br><br>[Sample description] ' + d_sample[k]['description']
    
            if k in d_experiment:
                s += ' <br><br>[Experiment title] ' + d_experiment[k]['title'] + ' <br><br>[Experiment description] ' + d_experiment[k]['description']
    
            if s == '':
                s = '.'
    
            lst.append(s)

            lst.append(d_experiment[k]['library_strategy'])

            lst.append(d_experiment[k]['library_source'])            
            
        print('%s' % ('\t'.join(lst)))