diff options
Diffstat (limited to 'Code/refine_tissue.py')
-rw-r--r-- | Code/refine_tissue.py | 302 |
1 files changed, 302 insertions, 0 deletions
# Usage: python refine_tissue.py > ../Data/information/experiment.and.tissue.2.txt
#
# Purpose: for each RNA-seq in experiment.and.tissue.1.txt, add a column
# suggested.tissue as its tissue annotation.
#
# 2 June 2017, slcu, hui
# Last modified 19 June 2017, slcu, hui

import operator
import os
import string
import sys


# Input table (generated by python assign_tissue.py) and the scratch file that
# holds its first four columns.
INPUT_PATH = '../Data/temp/experiment.and.tissue.1.txt'
TEMP_PATH = '../Data/temp/a.txt'
OUTPUT_HEADER = 'run.id\tinferred.tissue\tbiosample.tissue\tbiosample.id\tsuggested.tissue'

# Free-text tissue descriptions (or single words) -> canonical tissue label.
_CANONICAL_FORMS = {
    'seedlings': 'seedling',
    'roots': 'root',
    'leaves': 'leaf',
    'flowers': 'flower',
    'floral': 'flower',
    'shoots': 'shoot',
    'apices': 'apex',
    'stems': 'stem',
    'seeds': 'seed',
    'petals': 'petal',
    'sepals': 'sepal',
    'embryos': 'embryo',
    'embryonic': 'embryo',
    'cotyledons': 'cotyledon',
    'hairs': 'hair',
    'meristems': 'meristem',
    'epidermal': 'epidermis',
    'apical': 'apex',
    'buds': 'bud',
    'vacuoles': 'vacuole',
    'vacuolar': 'vacuole',
    'tips': 'tip',
    'pollens': 'pollen',
    'hypocotyls': 'hypocotyl',
    'tubes': 'tube',
    'stomatal': 'stomata',
    'ovules': 'ovule',
    'pistils': 'pistil',
    'anthers': 'anther',
    'carpels': 'carpel',
    'pedicle': 'pedicel',
    'vascular': 'vasculum',
    'whole plant': 'seedling',
    'inflorescence': 'flower.inflorescence',
    'inflorescences': 'flower.inflorescence',
    'whole seedling': 'seedling',
    'whole rosette': 'leaf.rosette',
    'whole aerial seedling': 'seedling.aerial',
    'vegatative shoot apical meristem': 'shoot.apical.meristem',
    'inflorescence containing stage 8 and younger flowers': 'flower.inflorescence',
    'plant roots': 'root',
    'entire vegetative rosette': 'leaf.rosette',
    'fungal-colonized plant roots': 'root.fungal.colonized',
    'rosettes - 5 leaves stage': 'leaf.rosette',
    '2-week old seedlings without roots': 'seedling.no.roots',
    'immature inflorescence': 'inflorescence.immature',
    'rosette leaves': 'leaf.rosette',
    'plant seedling': 'seedling',
    'entire aerial part': 'aerial.tissue',
    '14-d-old entire seedlings': 'seedling',
    'rosette leaf': 'leaf.rosette',
    'whole seedlings': 'seedling',
    'etiolated 5d-old seedlings': 'seedling.etiolated',
    'root structure': 'root',
    'mature leaves': 'leaf.mature',
    'root tip': 'root.tip',
    '10d-old seedling': 'seedling',
    'primary root tip': 'root.tip',
    'epidermis including guard cells': 'epidermis',
    'root tip tissue': 'root.tip',
    'anther stage 4-7': 'flower.anther',
    'anther': 'flower.anther',
    'embryo': 'seed.embryo',
    'etiolated seedlings': 'seedling.etiolated',
    '21 days-old seedlings': 'seedling',
    'aerial tissue': 'aerial.tissue',
    'endosperm': 'seed.endosperm',
    'whole seed': 'seed',
    'pistils pollinated for 8 hours': 'flower.pistil.pollinated',
    'primary root': 'root',
    'whole floral bud': 'flower.bud',
    'whole seedling root': 'seedling.root',
    'whole plants': 'seedling',
    'aerial shoots': 'shoot',
    'flower bud': 'flower.bud',
    'aerial seedling': 'seedling.aerial',
    'anthers at stage 4-7': 'flower.anther',
    'carpels (collected manually from 15 developing inflorescences)': 'flower.carpel',
    'ath_shoot_meristem_1': 'shoot.meristem',
    'ath_whole_plant_1': 'seedling',
    'ath_whole_plant_2': 'seedling',
    'whole seeds': 'seed',
    '3-day-old root': 'root',
    'unopened flower buds': 'flower.bud',
    'first true leaf': 'leaf',
    '7 dag seedlings': 'seedling',
    'facs-sorted protoplasts from aerial tissue of 10-day old seedlings': 'seedling.protoplasts',
    'inflorescences and siliques': 'inflorescences.and.siliques',
    'Epidermis including guard cells epidermis including guard cells': 'leaf.stomata.epidermis',
    'base stem': 'stem',
    'siliques': 'silique',
    'whole organism': 'seedling',
    'seedling shoot': 'seedling.shoot',
    '10-day-old seedlings and inflorescences from 25-day-old plants': 'seedling.and.inflorescence',
    'shoot apical meristem': 'shoot.apical.meristem',
    'expanded mature leaves from 28 day old plants': 'leaf',
    'aerial tissues of 15 day seedlings': 'aerial.tissue',
    'whole parts': 'seedling',
    'aerial organs': 'aerial.tissue',
    'lower stem': 'stem',
    'upper stem': 'stem',
    'rosette': 'leaf.rosette',
    'root and shoot': 'root.and.shoot',
    'cell culture': 'cell.culture',
    'aerial part': 'aerial.tissue',
    'aerial': 'aerial.tissue',
    'whole plantlet without root': 'seedling',
    'sorted endodermis (facs)': 'endodermis.facs-sorted',
    'whole root': 'root',
    'siluge without seeds': 'seed',
    'first internode': 'stem',
    'rosettes': 'leaf.rosette',
    'hypocotyl': 'seedling.hypocotyl',
    'somatic embryo': 'seed.embryo',
}

# Plural word -> singular word, applied token by token before rule matching.
_PLURAL_TO_SINGULAR = {
    'roots': 'root', 'shoots': 'shoot',
    'leaves': 'leaf', 'flowers': 'flower',
    'anthers': 'anther', 'hairs': 'hair',
    'seedlings': 'seedling', 'apices': 'apex',
    'buds': 'bud', 'siliques': 'silique',
    'rosettes': 'rosette', 'meristems': 'meristem',
    'sepals': 'sepal', 'petals': 'petal',
    'inflorescences': 'inflorescence', 'carpels': 'carpel',
    'seeds': 'seed', 'pistils': 'pistil',
    'stamens': 'stamen', 'ovules': 'ovule',
    'tissues': 'tissue', 'ovaries': 'ovary',
    'veins': 'vein', 'nodes': 'node',
    'internodes': 'internode', 'fibres': 'fibre',
    'hypocotyls': 'hypocotyl', 'cotyledons': 'cotyledon',
    'plants': 'plant', 'embryos': 'embryo',
}

# (key words, tissue label) rules used by map_tissue.  Kept as an ordered
# tuple because the ORDER MATTERS: when two rules reach the same score, the
# one listed first wins.
_TISSUE_RULES = tuple((key.split(), tissue) for key, tissue in (
    ('hypocotyl', 'seedling.hypocotyl'),
    ('hypocotyl seedling', 'seedling.hypocotyl'),
    ('leaf', 'leaf'),
    ('leaf petiole', 'leaf.petiole'),
    ('petiole', 'leaf.petiole'),
    ('leaf blade', 'leaf.blade'),
    ('leaf first true', 'leaf'),
    ('leaf stomata', 'leaf.stomata'),
    ('stomata', 'leaf.stomata'),
    ('chlorophyll', 'leaf.chlorophyll'),
    ('vein', 'leaf.vein'),
    ('leaf vein', 'leaf.vein'),
    ('leaf lamina', 'leaf.lamina'),
    ('leaf rosette', 'leaf.rosette'),
    ('rosette', 'leaf.rosette'),
    ('rosette leaf', 'leaf.rosette'),
    ('shoot', 'shoot'),
    ('aerial shoot', 'aerial.shoot'),
    ('shoot apex', 'shoot.apex'),
    ('shoot tip', 'shoot.apex'),
    ('flower', 'flower'),
    ('flower petal', 'flower.petal'),
    ('flower sepal', 'flower.sepal'),
    ('flower stamen', 'flower.stamen'),
    ('flower anther', 'flower.anther'),
    ('flower carpel', 'flower.carpel'),
    ('flower pistil', 'flower.pistil'),
    ('flower inflorescence', 'flower.inflorescence'),
    ('stigma', 'flower.stigma'),
    ('filament', 'flower.filament'),
    ('style', 'flower.style'),
    ('anther', 'flower.anther'),
    ('petal', 'flower.petal'),
    ('sepal', 'flower.sepal'),
    ('stamen', 'flower.stamen'),
    ('carpel', 'flower.carpel'),
    ('pistil', 'flower.pistil'),
    ('ovary', 'flower.ovary'),
    ('pedicel', 'flower.pedicel'),
    ('ovule', 'flower.ovule'),
    ('inflorescence', 'flower.inflorescence'),
    ('seed', 'seed'),
    ('epicotyl', 'seed.epicotyl'),
    ('radicle', 'seed.radicle'),
    ('embryo', 'seed.embryo'),
    ('endosperm', 'seed.endosperm'),
    ('endodermis', 'endodermis'),
    ('stem', 'stem'),
    ('pith', 'pith'),
    ('protoxylem', 'protoxylem'),
    ('xylem', 'xylem'),
    ('phloem', 'phloem'),
    ('sclerenchyma', 'sclerenchyma'),
    ('bast fibre', 'bast.fibre'),
    ('cortex', 'cortex'),
    ('parenchyma', 'parenchyma'),
    ('mesophyll', 'leaf.mesophyll'),
    ('shoot apical meristem', 'meristem.shoot.apical'),
    ('root apical meristem', 'meristem.root.apical'),
    ('apical meristem', 'meristem.apical'),
    ('floral meristem', 'meristem.floral'),
    ('inflorescence meristem', 'meristem.inflorescence'),
    ('meristem', 'meristem'),
    ('meristem shoot', 'meristem.shoot'),
    ('cotyledon', 'cotyledon'),
    ('apical', 'apical'),
    ('basal', 'basal'),
    ('root', 'root'),
    ('root apex', 'root.apex'),
    ('root tip', 'root.tip'),
    ('root primary tip', 'root.primary.tip'),
    ('root cap', 'root.cap'),
    ('root lateral', 'root.lateral'),
    ('root primary', 'root.primary'),
    ('root hair', 'root.hairs'),
    ('bud', 'bud'),
    ('bud axillary', 'bud.axillary'),
    ('bud lateral', 'bud.axillary'),
    ('bud apical', 'bud.apical'),
    ('bud floral', 'bud.flower'),
    ('bud flower', 'bud.flower'),
    ('bud meristem', 'bud.meristem'),
    ('internode', 'stem.internode'),
    ('node', 'stem.node'),
    ('vascular', 'vasculum'),
    ('epidermis', 'epidermis'),
    ('seedling', 'seedling'),
    ('plant', 'seedling'),
    ('whole plant', 'seedling'),
    ('whole', 'seedling'),
    ('whole parts', 'seedling'),
    ('whole root', 'root'),
    ('seedling root', 'seedling.root'),
    ('seedling shoot', 'seedling.shoot'),
    ('seedling etiolated', 'seedling.etiolated'),
    ('aerial', 'aerial'),
    ('aerial tissue', 'aerial.tissue'),
    ('aerial seedling', 'seedling.aerial'),
    ('silique', 'silique'),
    ('unknown', 'unknown'),
    ('siluge', 'seed'),
    ('bundle sheath', 'leaf'),
))


def get_singular_form(w):
    ''' Return the canonical tissue label for the word or phrase w.

    The lookup is an exact, case-sensitive match against a fixed table of
    plural words and free-text tissue descriptions.  If w is not in the
    table it is returned unchanged.
    '''
    return _CANONICAL_FORMS.get(w, w)


def remove_parenthese(s):
    ''' Return s truncated just before its first '(' (s itself if none). '''
    if '(' in s:
        return s[:s.find('(')]
    return s


def make_singular(lst):
    ''' Return a new list in which every known plural word in lst is
    replaced by its singular form; other words are kept as they are.
    '''
    return [_PLURAL_TO_SINGULAR.get(x, x) for x in lst]


def map_tissue(s):
    ''' Given a string s, return the most likely tissue label.

    s is lower-cased, split on whitespace and singularised.  A rule matches
    when all of its key words occur in s; its score is the total number of
    occurrences of those key words in s.  The label of the highest-scoring
    rule is returned (the earliest rule wins ties), or 'unknown' when no
    rule matches.
    '''
    counts = {}
    for word in make_singular(s.lower().split()):
        counts[word] = counts.get(word, 0) + 1

    best_tissue = 'unknown'
    best_score = 0
    for key_words, tissue in _TISSUE_RULES:
        if not all(w in counts for w in key_words):
            continue
        score = sum(counts[w] for w in key_words)
        # Strict '>' keeps the first rule among equally scored ones.
        if score > best_score:
            best_tissue = tissue
            best_score = score
    return best_tissue


def repeat_words(s):
    ''' s in the form of meristem(2); return 'meristem meristem'.

    A string without '(' is returned stripped.  A missing ')' is tolerated
    (the count then runs to the end of the string).  Raises ValueError if
    the text between the parentheses is not an integer.
    '''
    s = s.strip()
    open_idx = s.find('(')
    if open_idx < 0:
        return s
    close_idx = s.find(')', open_idx)
    if close_idx < 0:
        # Without this, a slice ending at -1 would silently drop the last digit.
        close_idx = len(s)
    word = s[:open_idx]
    n = int(s[(open_idx + 1):close_idx])
    return ' '.join(n * [word])


def get_words(s):
    ''' s in the form meristem(2);leaf(2);bud(1) or shoot.meristem

    Return a space-separated string of words: an item with a count is
    repeated that many times, an item without a count is split on '.'.
    Items that have neither a count nor a '.' are kept as a single word.
    '''
    result = []
    for item in s.split(';'):
        if '(' in item:
            result.append(repeat_words(item))
        else:
            # split('.') returns [item] for a plain word, so nothing is lost.
            result.extend(part for part in item.split('.') if part)
    return ' '.join(result)


def remove_punctuation(s):
    ''' Return s with every underscore replaced by a space. '''
    return s.replace('_', ' ')


def suggest_tissue(inferred, biosample):
    ''' Return the suggested tissue for one row.

    biosample (the biosample.tissue column) is used when it is informative,
    i.e. neither 'part_unknown' nor '.'; otherwise the label is derived from
    inferred (the inferred.tissue column), which is either a single item such
    as leaf(3) or a ';'-separated list such as meristem(2);leaf(2);bud(1).
    '''
    if biosample != 'part_unknown' and biosample != '.':
        text = biosample.lower()
    elif ';' in inferred:
        text = get_words(inferred)
    else:
        text = remove_parenthese(inferred)
    return map_tissue(remove_punctuation(text))


def main():
    ''' Read the first four columns of INPUT_PATH and print every row to
    stdout with an extra suggested.tissue column.
    '''
    if not os.path.exists(INPUT_PATH):
        # stdout is normally redirected into the result file, so report the
        # problem on stderr and exit with a non-zero status.
        sys.exit('Run python assign_tissue.py > ' + INPUT_PATH + ' first.')

    cmd = 'cut -f 1-4 ' + INPUT_PATH + ' > ' + TEMP_PATH  # input generated by python assign_tissue.py
    if os.system(cmd) != 0:
        sys.exit('Command failed: ' + cmd)

    print(OUTPUT_HEADER)
    with open(TEMP_PATH) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            lst = line.split('\t')
            # strip() can remove empty trailing columns; treat them as missing ('.').
            lst += ['.'] * (3 - len(lst))
            print(line + '\t' + suggest_tissue(lst[1], lst[2]))


if __name__ == '__main__':
    main()