# Usage: python assign_tissue.py > ../Data/temp/experiment.and.tissue.1.txt
#        Set TPM_FILE = , d =  and d2 =
#
# Purpose: for each RNA-seq column in the TPM_FILE, get its tissue information
# Excute this command to avoid encoding error:  export PYTHONIOENCODING=UTF-8
#
# 2 June 2017, slcu, hui
# Last modified 19 June 2017, slcu, hui

import os, sys, json
import urllib2

def make_tissue_dict(fname):
    f = open(fname)
    d = json.load(f)
    f.close()
    return d

def get_experiment_id(fname):
    f = open(fname)
    lines = f.readlines()
    f.close()
    result = []
    for line in lines[1:]:
        line = line.strip()
        lst = line.split('\t')        
        if len(lst) >= 2:
            result.append(lst[1])
    return result

def get_sra_id(x):
    if 'RR' in x:
        index1 = x.find('RR')
        index2 = x.find('X')
        if index2 == -1:
            index2 = len(x)
        return x[index1-1:index2]
    return x

def make_sample_dict(fname):
    f = open(fname)
    lines = f.readlines()
    f.close()
    d = {}
    for line in lines[1:]:
        line = line.strip()
        lst = line.split('\t')
        if len(lst) >= 2:
            runid = lst[0]
            sample = lst[1]
            d[runid] = (sample, ';'.join(lst[2:]))
    return d


def stringfy_json(d):
    s = ''

    if "organismPart" in d["characteristics"]:
        s =  d["characteristics"]["organismPart"][0]["text"]
    else:
        s = 'part_unknown'
    
    if "accession" in d:
        s += '\t' + d["accession"]
    else:
        s += '\t' + '.'

    if "name" in d:
        s += '\t' + d["name"]
    else:
        s += '\t' + '.'

    if "synonym" in d["characteristics"]:
        s +=  '\t' + d["characteristics"]["synonym"][0]["text"]
    else:
        s += '\t' + '.'

    if "ecotype" in d["characteristics"]:
        s +=  '\t' + d["characteristics"]["ecotype"][0]["text"]
    else:
        s += '\t' + '.'
        
    if "developmentStage" in d["characteristics"]:
        s +=  '\t' + d["characteristics"]["developmentStage"][0]["text"]
    else:
        s += '\t' + '.'
        
    if "description" in d:
        s += '\t' + d["description"] if d["description"] != None else '.'
    else:
        s += '\t' + '.'
        
    return s

def make_information(s, info_dir):
    lst = s.split('...')
    sample_id = lst[0]
    filename = '%s/%s.json' % (info_dir, sample_id)
    #url = 'https://www.ebi.ac.uk/biosamples/api/samples/search/findByAccession?accession=%s' % (sample_id)
    if not os.path.exists(filename):
        cmd = 'curl -s -H Content-Type:application/json https://www.ebi.ac.uk/biosamples/api/samples/search/findByAccession\?accession\=%s > %s' % (sample_id, filename)        
        os.system(cmd)

    f = open(filename)
    d = json.load(f)
    f.close()
    #d = json.load(urllib2.urlopen(url))
    if len(d["_embedded"]["samples"]) > 0:
        return stringfy_json(d["_embedded"]["samples"][0])
    else:
        return '.\t.'

# main
BIOSAMPLE_INFO_DIR = '/home/hui/network/v03/Data/information/BioSample' # put downloaded BioSample json files here
if not os.path.isdir(BIOSAMPLE_INFO_DIR):
    os.makedirs(BIOSAMPLE_INFO_DIR)
    
# get first row in the TPM file
TPM_FILE = '../Data/history/expr/TPM.txt'
cmd = 'head -1 %s | perl -ne \'@words = split /\t/; $count = 1; for $x (@words) {print $count, "\t", $x, "\n"; $count++}\' > ../Data/temp/a.txt' % (TPM_FILE)
os.system(cmd)

lst = get_experiment_id('../Data/temp/a.txt')
d = make_tissue_dict('../Data/information/rnaseq_info_database.json') # excuting parse_xml.py > rnaseq_info_database.txt, mainly for getting tissue names (inferred by word frequency in the description)
d2 = make_sample_dict('../Data/information/rnaseq_info_database.txt') # parse_xml.py > rnaseq_info_database.txt, mainly for getting the BioSample id for each run
head = ''
for x in lst:
    k = get_sra_id(x) # get rid of prefix R00... and suffix ..XX
    s = x
    if k in d:
        s += '\t' + d[k]['tissue'] + '\t' + make_information(d2[k][0].decode('utf8'), BIOSAMPLE_INFO_DIR) + '\t' + d2[k][1].decode('utf8')
    elif x.startswith('R0001') and ('146' in x or '147' in x): # inhouse data
        s += '\t' + 'seedling\t' + '\t'.join(8*['.'])
    elif x.startswith('R0002'): # pcubas (Spain) data
        s += '\t' + 'meristem\t' + '\t'.join(8*['.'])
    else:
        s += '\t' + '\t'.join(9*['.']) # k is gene_id
    print(s)
    head += s.replace('\t', '_')  + '\t'