# Usage: python refine_tissue.py > ../Data/information/experiment.and.tissue.2.txt
#        Set cmd =
#
# Purpose: for each RNA-seq run listed in ../Data/temp/experiment.and.tissue.1.txt (generated by assign_tissue.py), append a column suggested.tissue holding its tissue annotation.
#
# 2 June 2017, slcu, hui
# Last modified 19 June 2017, slcu, hui

import os, sys, operator
import string


def get_singular_form(w):
    ''' Return the canonical form of the tissue term w, or w itself if it is not listed.

    Despite the name, the table covers two kinds of keys: single plural or
    adjectival words (e.g. 'leaves', 'floral') that map to a singular noun,
    and whole free-text descriptions (e.g. 'whole rosette') that map to a
    dotted tissue label.  The lookup is an exact, case-sensitive match on the
    complete string.
    '''
    d = {
        # single words: plural / adjective -> singular noun
        'seedlings':'seedling',
        'roots':'root',
        'leaves':'leaf',
        'flowers':'flower',
        'floral':'flower',
        'shoots':'shoot',
        'apices':'apex',
        'stems':'stem',
        'seeds':'seed',
        'petals':'petal',   # was 'petals' -> 'petals', i.e. never singularised
        'sepals':'sepal',
        'embryos':'embryo',
        'embryonic':'embryo',
        'cotyledons':'cotyledon',
        'hairs':'hair',
        'meristems':'meristem',
        'epidermal':'epidermis',
        'apical':'apex',
        'buds':'bud',
        'vacuoles':'vacuole',
        'vacuolar':'vacuole',
        'tips':'tip',
        'pollens':'pollen',
        'hypocotyls':'hypocotyl',
        'tubes':'tube',
        'stomatal':'stomata',
        'ovules':'ovule',   # was reversed ('ovule' -> 'ovules')
        'pistils':'pistil',
        'anthers':'anther',
        'carpels':'carpel',
        'pedicle':'pedicel',
        'vascular':'vasculum',
        # free-text descriptions -> dotted tissue label
        'whole plant':'seedling',
        'inflorescence':'flower.inflorescence',
        'inflorescences':'flower.inflorescence',
        'whole seedling':'seedling',
        'whole rosette':'leaf.rosette',
        'whole aerial seedling':'seedling.aerial',
        'vegatative shoot apical meristem':'shoot.apical.meristem',
        'inflorescence containing stage 8 and younger flowers':'flower.inflorescence',
        'plant roots':'root',
        'entire vegetative rosette':'leaf.rosette',
        'fungal-colonized plant roots':'root.fungal.colonized',
        'rosettes - 5 leaves stage':'leaf.rosette',
        '2-week old seedlings without roots':'seedling.no.roots',
        'immature inflorescence':'inflorescence.immature',
        'rosette leaves':'leaf.rosette',
        'plant seedling':'seedling',
        'entire aerial part':'aerial.tissue',
        '14-d-old entire seedlings':'seedling',
        'rosette leaf':'leaf.rosette',
        'whole seedlings':'seedling',
        'etiolated 5d-old seedlings':'seedling.etiolated',
        'root structure':'root',
        'mature leaves':'leaf.mature',
        'root tip':'root.tip',
        '10d-old seedling':'seedling',
        'primary root tip':'root.tip',
        'epidermis including guard cells':'epidermis',
        'root tip tissue':'root.tip',
        'anther stage 4-7':'flower.anther',
        'anther':'flower.anther',
        'embryo':'seed.embryo',
        'etiolated seedlings':'seedling.etiolated',
        '21 days-old seedlings':'seedling',
        'aerial tissue':'aerial.tissue',
        'endosperm':'seed.endosperm',
        'whole seed':'seed',
        'pistils pollinated for 8 hours':'flower.pistil.pollinated',
        'primary root':'root',
        'whole floral bud':'flower.bud',
        'whole seedling root':'seedling.root',
        'whole plants':'seedling',
        'aerial shoots':'shoot',
        'flower bud':'flower.bud',
        'aerial seedling':'seedling.aerial',
        'anthers at stage 4-7':'flower.anther',
        'carpels (collected manually from 15 developing inflorescences)':'flower.carpel',
        'ath_shoot_meristem_1':'shoot.meristem',
        'ath_whole_plant_1':'seedling',
        'ath_whole_plant_2':'seedling',
        'whole seeds':'seed',
        '3-day-old root':'root',
        'unopened flower buds':'flower.bud',
        'first true leaf':'leaf',
        '7 dag seedlings':'seedling',
        'facs-sorted protoplasts from aerial tissue of 10-day old seedlings':'seedling.protoplasts',
        'inflorescences and siliques':'inflorescences.and.siliques',
        'Epidermis including guard cells epidermis including guard cells':'leaf.stomata.epidermis',
        'base stem':'stem',
        'siliques':'silique',
        'whole organism':'seedling',
        'seedling shoot':'seedling.shoot',
        '10-day-old seedlings and inflorescences from 25-day-old plants':'seedling.and.inflorescence',
        'shoot apical meristem':'shoot.apical.meristem',
        'expanded mature leaves from 28 day old plants':'leaf',
        'aerial tissues of 15 day seedlings':'aerial.tissue',
        'whole parts':'seedling',
        'aerial organs':'aerial.tissue',
        'lower stem':'stem',
        'upper stem':'stem',
        'rosette':'leaf.rosette',
        'root and shoot':'root.and.shoot',
        'cell culture':'cell.culture',
        'aerial part':'aerial.tissue',
        'aerial':'aerial.tissue',
        'whole plantlet without root':'seedling',
        'sorted endodermis (facs)':'endodermis.facs-sorted',
        # The literal used to list 'whole root' twice ('seedling.root', then
        # 'root'); the later entry always won, so only that one is kept.
        'whole root':'root',
        'siluge without seeds':'seed',
        'first internode':'stem',
        'rosettes':'leaf.rosette',
        'hypocotyl':'seedling.hypocotyl',
        'somatic embryo':'seed.embryo',
    }
    return d.get(w, w)

def remove_parenthese(s):
    ''' Return the part of s that precedes its first '('; s unchanged when it has none. '''
    # partition() yields (s, '', '') when the separator is absent.
    before_paren, _, _ = s.partition('(')
    return before_paren



def make_singular(lst):
    ''' Return a new list in which every known plural in lst is replaced by its singular.

    Words that are not in the plural table are copied through unchanged, and
    the order of lst is kept.
    '''
    plural_to_singular = {
        'roots': 'root',
        'shoots': 'shoot',
        'leaves': 'leaf',
        'flowers': 'flower',
        'anthers': 'anther',
        'hairs': 'hair',
        'seedlings': 'seedling',
        'apices': 'apex',
        'buds': 'bud',
        'siliques': 'silique',
        'rosettes': 'rosette',
        'meristems': 'meristem',
        'sepals': 'sepal',
        'petals': 'petal',
        'inflorescences': 'inflorescence',
        'carpels': 'carpel',
        'seeds': 'seed',
        'pistils': 'pistil',
        'stamens': 'stamen',
        'ovules': 'ovule',
        'tissues': 'tissue',
        'ovaries': 'ovary',
        'veins': 'vein',
        'nodes': 'node',
        'internodes': 'internode',
        'fibres': 'fibre',
        'hypocotyls': 'hypocotyl',
        'cotyledons': 'cotyledon',
        'plants': 'plant',
        'embryos': 'embryo',
    }
    return [plural_to_singular.get(word, word) for word in lst]

def map_tissue(s):
    ''' Return the most likely tissue label for the free-text description s.

    s is lower-cased, split on whitespace and passed through make_singular.
    A key of d matches when every one of its words occurs among those words;
    its score is the total number of occurrences of its words.  The value of
    the highest-scoring key is returned; on a tie the key listed first in d
    wins.  'unknown' is returned when no key matches.
    '''
    d = {
        'hypocotyl':'seedling.hypocotyl',
        'hypocotyl seedling':'seedling.hypocotyl',
        'leaf':'leaf',
        'leaf petiole':'leaf.petiole',
        'petiole':'leaf.petiole',
        'leaf blade':'leaf.blade',
        'leaf first true':'leaf',
        'leaf stomata':'leaf.stomata',
        'stomata':'leaf.stomata',
        'chlorophyll':'leaf.chlorophyll',
        'vein':'leaf.vein',
        'leaf vein':'leaf.vein',
        'leaf lamina':'leaf.lamina',
        # Was 'leaf rosette' (with a space).  This key ties with
        # 'rosette leaf' and is listed first, so the malformed label was the
        # one actually returned for e.g. "rosette leaves".
        'leaf rosette':'leaf.rosette',
        'rosette':'leaf.rosette',
        'rosette leaf':'leaf.rosette',
        'shoot':'shoot',
        'aerial shoot':'aerial.shoot',
        'shoot apex':'shoot.apex',
        'shoot tip':'shoot.apex',
        'flower':'flower',
        'flower petal':'flower.petal',
        'flower sepal':'flower.sepal',
        'flower stamen':'flower.stamen',
        'flower anther':'flower.anther',
        'flower carpel':'flower.carpel',
        'flower pistil':'flower.pistil',
        'flower inflorescence':'flower.inflorescence',
        'stigma':'flower.stigma',
        'filament':'flower.filament',
        'style':'flower.style',
        'anther':'flower.anther',
        'petal':'flower.petal',
        'sepal':'flower.sepal',
        'stamen':'flower.stamen',
        'carpel':'flower.carpel',
        'pistil':'flower.pistil',
        'ovary':'flower.ovary',
        'pedicel':'flower.pedicel',
        'ovule':'flower.ovule',
        'inflorescence':'flower.inflorescence',
        'seed':'seed',
        'epicotyl':'seed.epicotyl',
        'radicle':'seed.radicle',
        'embryo':'seed.embryo',
        'endosperm':'seed.endosperm',
        'endodermis':'endodermis',
        'stem':'stem',
        'pith':'pith',
        'protoxylem':'protoxylem',
        'xylem':'xylem',
        'phloem':'phloem',
        'sclerenchyma':'sclerenchyma',
        'bast fibre':'bast.fibre',
        'cortex':'cortex',
        'parenchyma':'parenchyma',
        'mesophyll':'leaf.mesophyll',
        'shoot apical meristem':'meristem.shoot.apical',
        # Was '.meristem.root.apical' (stray leading dot).
        'root apical meristem':'meristem.root.apical',
        'apical meristem':'meristem.apical',
        'floral meristem':'meristem.floral',
        'inflorescence meristem':'meristem.inflorescence',
        'meristem':'meristem',
        'meristem shoot':'meristem.shoot',
        'cotyledon':'cotyledon',
        'apical':'apical',
        'basal':'basal',
        'root':'root',
        'root apex':'root.apex',
        'root tip':'root.tip',
        'root primary tip':'root.primary.tip',
        'root cap':'root.cap',
        'root lateral':'root.lateral',
        'root primary':'root.primary',
        'root hair':'root.hairs',
        'bud':'bud',
        'bud axillary':'bud.axillary',
        'bud lateral':'bud.axillary',
        'bud apical':'bud.apical',
        'bud floral':'bud.flower',
        'bud flower':'bud.flower',
        'bud meristem':'bud.meristem',
        'internode':'stem.internode',
        'node':'stem.node',
        'vascular':'vasculum',
        'epidermis':'epidermis',
        'seedling':'seedling',
        'plant':'seedling',
        'whole plant':'seedling',
        'whole':'seedling',
        'whole parts':'seedling',
        'whole root':'root',
        'seedling root':'seedling.root',
        'seedling shoot':'seedling.shoot',
        'seedling etiolated':'seedling.etiolated',
        'aerial':'aerial',
        'aerial tissue':'aerial.tissue',
        'aerial seedling':'seedling.aerial',
        'silique':'silique',
        'unknown':'unknown',
        'siluge':'seed',
        'bundle sheath':'leaf',
    }
    words = make_singular(s.lower().split())

    best_tissue = 'unknown'
    best_score = 0
    for key in d:
        key_words = key.split()
        # Every word of the key must be present for the key to count at all.
        if not all(kw in words for kw in key_words):
            continue
        # Repeated words in s raise the score, so "meristem meristem leaf"
        # favours 'meristem' over 'leaf'.
        score = sum(words.count(kw) for kw in key_words)
        # Strict '>' keeps the first key seen among equal scores, which is
        # the same winner the previous stable descending sort produced.
        if score > best_score:
            best_tissue = d[key]
            best_score = score
    return best_tissue


def repeat_words(s):
    ''' Expand a counted term such as 'meristem(2)' into 'meristem meristem'.

    s is stripped first.  Without a '(' the stripped string is returned as
    is; otherwise the text between the first '(' and the first ')' is read as
    an integer repeat count for the text before the '('.
    '''
    text = s.strip()
    open_at = text.find('(')
    if open_at >= 0:
        close_at = text.find(')')
        times = int(text[open_at + 1:close_at])
        return ' '.join([text[:open_at]] * times)
    return text

def get_words(s):
    ''' Flatten a ';'-separated tissue string into space-separated words.

    s is in the form meristem(2);leaf(2);bud(1) or shoot.meristem.  Items
    carrying a count are expanded by repeat_words (meristem(2) becomes
    'meristem meristem'); other items are split on '.', so shoot.meristem
    contributes 'shoot' and 'meristem'.
    '''
    words = []
    for item in s.split(';'):
        if '(' in item:
            words.append(repeat_words(item))
        else:
            # Previously an item was kept only if it contained a '.', so a
            # bare word such as 'leaf' in 'meristem(2);leaf' was silently
            # dropped.  split('.') returns the item itself when there is no
            # dot; empty pieces (e.g. from a trailing ';') are skipped.
            words.extend(part for part in item.split('.') if part)
    return ' '.join(words)

def remove_punctuation(s):
    ''' Return s with every underscore turned into a space (no other character is touched). '''
    pieces = s.split('_')
    return ' '.join(pieces)

# main

# Copy the first four columns of assign_tissue.py's output to a scratch file,
# then re-emit every row on stdout with a fifth column, suggested.tissue.
if os.path.exists('../Data/temp/experiment.and.tissue.1.txt'):
    cmd = 'cut -f 1-4 ../Data/temp/experiment.and.tissue.1.txt > ../Data/temp/a.txt' # generated by python assign_tissue.py
    os.system(cmd)
else:
    # stdout is meant to be redirected into the result table (see Usage), so
    # the hint goes to stderr and the exit status reports the failure instead
    # of leaving a one-line "table" behind with status 0.
    sys.stderr.write('Run python assign_tissue.py > ../Data/temp/experiment.and.tissue.1.txt first.\n')
    sys.exit(1)

with open('../Data/temp/a.txt') as f:
    print('run.id\tinferred.tissue\tbiosample.tissue\tbiosample.id\tsuggested.tissue')
    for line in f:
        line = line.strip()
        if not line:
            continue  # a blank line has no columns to annotate
        lst = line.split('\t')

        # Going by the header printed above, lst[1] is inferred.tissue and
        # lst[2] is biosample.tissue (assumes the input columns follow that
        # order -- confirm against assign_tissue.py).  The third column is
        # used unless it is one of the two "no value" markers.
        if lst[2] != 'part_unknown' and lst[2] != '.':
            s = lst[2].lower()
        else:
            s = lst[1]
            if ';' in s:
                s = get_words(s)
            else:
                s = remove_parenthese(s)

        print(line + '\t' + map_tissue(remove_punctuation(s)))