# Usage: python refine_tissue.py > ../Data/information/experiment.and.tissue.2.txt
#        Set cmd =
#
# Purpose: for each RNA-seq in experiment.and.tissue.1.txt, add a column
#          suggested.tissue as its tissue annotation.
#
# 2 June 2017, slcu, hui
# Last modified 19 June 2017, slcu, hui

import os
import sys
import operator
import string


# Free-text tissue description -> canonical tissue label, used by
# get_singular_form().  Keys that were listed twice in earlier revisions are
# kept once with the value that actually took effect (the last one), e.g.
# 'whole root' -> 'root'.
_SINGULAR_FORMS = {
    'seedlings': 'seedling',
    'roots': 'root',
    'leaves': 'leaf',
    'flowers': 'flower',
    'floral': 'flower',
    'shoots': 'shoot',
    'apices': 'apex',
    'stems': 'stem',
    'seeds': 'seed',
    'petals': 'petal',   # was 'petals' -> 'petals', a no-op
    'sepals': 'sepal',
    'embryos': 'embryo',
    'embryonic': 'embryo',
    'cotyledons': 'cotyledon',
    'hairs': 'hair',
    'meristems': 'meristem',
    'epidermal': 'epidermis',
    'apical': 'apex',
    'buds': 'bud',
    'vacuoles': 'vacuole',
    'vacuolar': 'vacuole',
    'tips': 'tip',
    'pollens': 'pollen',
    'hypocotyls': 'hypocotyl',
    'tubes': 'tube',
    'stomatal': 'stomata',
    'ovules': 'ovule',   # was 'ovule' -> 'ovules', i.e. singular -> plural
    'pistils': 'pistil',
    'anthers': 'anther',
    'carpels': 'carpel',
    'pedicle': 'pedicel',
    'vascular': 'vasculum',
    'whole plant': 'seedling',
    'inflorescence': 'flower.inflorescence',
    'inflorescences': 'flower.inflorescence',
    'whole seedling': 'seedling',
    'whole rosette': 'leaf.rosette',
    'whole aerial seedling': 'seedling.aerial',
    'vegatative shoot apical meristem': 'shoot.apical.meristem',
    'inflorescence containing stage 8 and younger flowers': 'flower.inflorescence',
    'plant roots': 'root',
    'entire vegetative rosette': 'leaf.rosette',
    'fungal-colonized plant roots': 'root.fungal.colonized',
    'rosettes - 5 leaves stage': 'leaf.rosette',
    '2-week old seedlings without roots': 'seedling.no.roots',
    'immature inflorescence': 'inflorescence.immature',
    'rosette leaves': 'leaf.rosette',
    'plant seedling': 'seedling',
    'entire aerial part': 'aerial.tissue',
    '14-d-old entire seedlings': 'seedling',
    'rosette leaf': 'leaf.rosette',
    'whole seedlings': 'seedling',
    'etiolated 5d-old seedlings': 'seedling.etiolated',
    'root structure': 'root',
    'mature leaves': 'leaf.mature',
    'root tip': 'root.tip',
    '10d-old seedling': 'seedling',
    'primary root tip': 'root.tip',
    'epidermis including guard cells': 'epidermis',
    'root tip tissue': 'root.tip',
    'anther stage 4-7': 'flower.anther',
    'anther': 'flower.anther',
    'embryo': 'seed.embryo',
    'etiolated seedlings': 'seedling.etiolated',
    '21 days-old seedlings': 'seedling',
    'aerial tissue': 'aerial.tissue',
    'endosperm': 'seed.endosperm',
    'whole seed': 'seed',
    'pistils pollinated for 8 hours': 'flower.pistil.pollinated',
    'primary root': 'root',
    'whole floral bud': 'flower.bud',
    'whole seedling root': 'seedling.root',
    'whole root': 'root',
    'whole plants': 'seedling',
    'aerial shoots': 'shoot',
    'flower bud': 'flower.bud',
    'aerial seedling': 'seedling.aerial',
    'anthers at stage 4-7': 'flower.anther',
    'carpels (collected manually from 15 developing inflorescences)': 'flower.carpel',
    'ath_shoot_meristem_1': 'shoot.meristem',
    'ath_whole_plant_1': 'seedling',
    'ath_whole_plant_2': 'seedling',
    'whole seeds': 'seed',
    '3-day-old root': 'root',
    'unopened flower buds': 'flower.bud',
    'first true leaf': 'leaf',
    '7 dag seedlings': 'seedling',
    'facs-sorted protoplasts from aerial tissue of 10-day old seedlings': 'seedling.protoplasts',
    'inflorescences and siliques': 'inflorescences.and.siliques',
    'Epidermis including guard cells epidermis including guard cells': 'leaf.stomata.epidermis',
    'base stem': 'stem',
    'siliques': 'silique',
    'whole organism': 'seedling',
    'seedling shoot': 'seedling.shoot',
    '10-day-old seedlings and inflorescences from 25-day-old plants': 'seedling.and.inflorescence',
    'shoot apical meristem': 'shoot.apical.meristem',
    'expanded mature leaves from 28 day old plants': 'leaf',
    'aerial tissues of 15 day seedlings': 'aerial.tissue',
    'whole parts': 'seedling',
    'aerial organs': 'aerial.tissue',
    'lower stem': 'stem',
    'upper stem': 'stem',
    'rosette': 'leaf.rosette',
    'root and shoot': 'root.and.shoot',
    'cell culture': 'cell.culture',
    'aerial part': 'aerial.tissue',
    'aerial': 'aerial.tissue',
    'whole plantlet without root': 'seedling',
    'sorted endodermis (facs)': 'endodermis.facs-sorted',
    'siluge without seeds': 'seed',
    'first internode': 'stem',
    'rosettes': 'leaf.rosette',
    'hypocotyl': 'seedling.hypocotyl',
    'somatic embryo': 'seed.embryo',
}

# Plural word -> singular word, applied token by token in make_singular().
_PLURAL_TO_SINGULAR = {
    'roots': 'root',
    'shoots': 'shoot',
    'leaves': 'leaf',
    'flowers': 'flower',
    'anthers': 'anther',
    'hairs': 'hair',
    'seedlings': 'seedling',
    'apices': 'apex',
    'buds': 'bud',
    'siliques': 'silique',
    'rosettes': 'rosette',
    'meristems': 'meristem',
    'sepals': 'sepal',
    'petals': 'petal',
    'inflorescences': 'inflorescence',
    'carpels': 'carpel',
    'seeds': 'seed',
    'pistils': 'pistil',
    'stamens': 'stamen',
    'ovules': 'ovule',
    'tissues': 'tissue',
    'ovaries': 'ovary',
    'veins': 'vein',
    'nodes': 'node',
    'internodes': 'internode',
    'fibres': 'fibre',
    'hypocotyls': 'hypocotyl',
    'cotyledons': 'cotyledon',
    'plants': 'plant',
    'embryos': 'embryo',
}

# Space-separated keyword set -> tissue label, used by map_tissue().
# Entry order matters: among equally scored matches the earliest entry wins.
_TISSUE_KEYWORDS = {
    'hypocotyl': 'seedling.hypocotyl',
    'hypocotyl seedling': 'seedling.hypocotyl',
    'leaf': 'leaf',
    'leaf petiole': 'leaf.petiole',
    'petiole': 'leaf.petiole',
    'leaf blade': 'leaf.blade',
    'leaf first true': 'leaf',
    'leaf stomata': 'leaf.stomata',
    'stomata': 'leaf.stomata',
    'chlorophyll': 'leaf.chlorophyll',
    'vein': 'leaf.vein',
    'leaf vein': 'leaf.vein',
    'leaf lamina': 'leaf.lamina',
    'leaf rosette': 'leaf.rosette',
    'rosette': 'leaf.rosette',
    'rosette leaf': 'leaf.rosette',
    'shoot': 'shoot',
    'aerial shoot': 'aerial.shoot',
    'shoot apex': 'shoot.apex',
    'shoot tip': 'shoot.apex',
    'flower': 'flower',
    'flower petal': 'flower.petal',
    'flower sepal': 'flower.sepal',
    'flower stamen': 'flower.stamen',
    'flower anther': 'flower.anther',
    'flower carpel': 'flower.carpel',
    'flower pistil': 'flower.pistil',
    'flower inflorescence': 'flower.inflorescence',
    'stigma': 'flower.stigma',
    'filament': 'flower.filament',
    'style': 'flower.style',
    'anther': 'flower.anther',
    'petal': 'flower.petal',
    'sepal': 'flower.sepal',
    'stamen': 'flower.stamen',
    'carpel': 'flower.carpel',
    'pistil': 'flower.pistil',
    'ovary': 'flower.ovary',
    'pedicel': 'flower.pedicel',
    'ovule': 'flower.ovule',
    'inflorescence': 'flower.inflorescence',
    'seed': 'seed',
    'epicotyl': 'seed.epicotyl',
    'radicle': 'seed.radicle',
    'embryo': 'seed.embryo',
    'endosperm': 'seed.endosperm',
    'endodermis': 'endodermis',
    'stem': 'stem',
    'pith': 'pith',
    'protoxylem': 'protoxylem',
    'xylem': 'xylem',
    'phloem': 'phloem',
    'sclerenchyma': 'sclerenchyma',
    'bast fibre': 'bast.fibre',
    'cortex': 'cortex',
    'parenchyma': 'parenchyma',
    'mesophyll': 'leaf.mesophyll',
    'shoot apical meristem': 'meristem.shoot.apical',
    'root apical meristem': 'meristem.root.apical',   # was '.meristem.root.apical'
    'apical meristem': 'meristem.apical',
    'floral meristem': 'meristem.floral',
    'inflorescence meristem': 'meristem.inflorescence',
    'meristem': 'meristem',
    'meristem shoot': 'meristem.shoot',
    'cotyledon': 'cotyledon',
    'apical': 'apical',
    'basal': 'basal',
    'root': 'root',
    'root apex': 'root.apex',
    'root tip': 'root.tip',
    'root primary tip': 'root.primary.tip',
    'root cap': 'root.cap',
    'root lateral': 'root.lateral',
    'root primary': 'root.primary',
    'root hair': 'root.hairs',
    'bud': 'bud',
    'bud axillary': 'bud.axillary',
    'bud lateral': 'bud.axillary',
    'bud apical': 'bud.apical',
    'bud floral': 'bud.flower',
    'bud flower': 'bud.flower',
    'bud meristem': 'bud.meristem',
    'internode': 'stem.internode',
    'node': 'stem.node',
    'vascular': 'vasculum',
    'epidermis': 'epidermis',
    'seedling': 'seedling',
    'plant': 'seedling',
    'whole plant': 'seedling',
    'whole': 'seedling',
    'whole parts': 'seedling',
    'whole root': 'root',
    'seedling root': 'seedling.root',
    'seedling shoot': 'seedling.shoot',
    'seedling etiolated': 'seedling.etiolated',
    'aerial': 'aerial',
    'aerial tissue': 'aerial.tissue',
    'aerial seedling': 'seedling.aerial',
    'silique': 'silique',
    'unknown': 'unknown',
    'siluge': 'seed',
    'bundle sheath': 'leaf',
}

# Keys split into words once, so map_tissue() does not redo it per call.
_TISSUE_RULES = [(key.split(), tissue) for key, tissue in _TISSUE_KEYWORDS.items()]

_INPUT_FILE = '../Data/temp/experiment.and.tissue.1.txt'  # generated by python assign_tissue.py
_TEMP_FILE = '../Data/temp/a.txt'


def get_singular_form(w):
    ''' Return the canonical label for the description w, or w itself if
    w is not in the lookup table.  The lookup is exact and case-sensitive.
    '''
    return _SINGULAR_FORMS.get(w, w)


def remove_parenthese(s):
    ''' Return the part of s before the first '(' (all of s if there is none). '''
    return s.partition('(')[0]


def make_singular(lst):
    ''' Return a new list in which each known plural word of lst is replaced
    by its singular form; other words are kept unchanged.
    '''
    return [_PLURAL_TO_SINGULAR.get(x, x) for x in lst]


def map_tissue(s):
    ''' Given a string s, if all words in a key of the keyword table are in s,
    then the corresponding value is a likely tissue.

    s is lower-cased, split on whitespace and singularised.  Each matching
    key is scored by the total number of occurrences of its words in s; the
    label of the highest-scoring key is returned (the earliest table entry
    wins a tie).  Return 'unknown' if no key matches.
    '''
    tokens = make_singular(s.lower().split())
    best_tissue = 'unknown'
    best_score = 0
    for words, tissue in _TISSUE_RULES:
        if not all(w in tokens for w in words):
            continue
        score = sum(tokens.count(w) for w in words)
        # Strict '>' keeps the first entry among equally scored matches.
        if score > best_score:
            best_tissue = tissue
            best_score = score
    return best_tissue


def repeat_words(s):
    ''' s in the form of meristem(2).  Return the word repeated that many
    times, separated by spaces, e.g. 'meristem meristem'.

    If s has no '(' it is returned stripped.  If the count is missing or not
    an integer, the word is returned once instead of raising ValueError.
    '''
    s = s.strip()
    start = s.find('(')
    if start < 0:
        return s
    word = s[:start]
    end = s.find(')', start)
    if end >= 0:
        count_text = s[start + 1:end]
    else:
        count_text = s[start + 1:]
    try:
        count = int(count_text)
    except ValueError:
        count = 1
    return ' '.join([word] * count)


def get_words(s):
    ''' s in the form meristem(2);leaf(2);bud(1) or shoot.meristem

    Return a space-separated string of words: each ';'-separated item is
    expanded by its repeat count (if any) and dotted labels are split into
    their parts.  Items that have neither a count nor a dot are kept as well.
    '''
    result = []
    for item in s.split(';'):
        expanded = repeat_words(item)
        result.extend(expanded.replace('.', ' ').split())
    return ' '.join(result)


def remove_punctuation(s):
    ''' Return s with every underscore replaced by a space. '''
    return s.replace('_', ' ')


def suggest_tissue(inferred_tissue, biosample_tissue):
    ''' Return the suggested tissue for one RNA-seq run.

    If biosample_tissue is informative (neither 'part_unknown' nor '.'), it is
    mapped with map_tissue().  Otherwise inferred_tissue is used: a single
    label has its '(count)' suffix removed and is returned as is, while a
    ';'-separated list is expanded with get_words() and mapped with
    map_tissue().
    '''
    if biosample_tissue != 'part_unknown' and biosample_tissue != '.':
        return map_tissue(remove_punctuation(biosample_tissue.lower()))
    if ';' not in inferred_tissue:
        return remove_parenthese(inferred_tissue)
    return map_tissue(remove_punctuation(get_words(inferred_tissue)))


def main():
    ''' Read the first four columns of experiment.and.tissue.1.txt and print
    them to standard output with an extra suggested.tissue column.
    '''
    if not os.path.exists(_INPUT_FILE):
        # Standard output is normally redirected to the result file, so the
        # hint goes to standard error and the exit status signals failure.
        sys.stderr.write('Run python assign_tissue.py > ../Data/temp/experiment.and.tissue.1.txt first.\n')
        sys.exit(1)

    cmd = 'cut -f 1-4 ../Data/temp/experiment.and.tissue.1.txt > ../Data/temp/a.txt'
    os.system(cmd)

    print('run.id\tinferred.tissue\tbiosample.tissue\tbiosample.id\tsuggested.tissue')
    with open(_TEMP_FILE) as f:
        for line in f:
            line = line.strip()
            lst = line.split('\t')
            if len(lst) < 3:
                # Blank or truncated row: nothing to annotate.
                if line:
                    sys.stderr.write('Skipping malformed line: %s\n' % line)
                continue
            print(line + '\t' + suggest_tissue(lst[1], lst[2]))


if __name__ == '__main__':
    main()