summaryrefslogtreecommitdiff
path: root/Code/refine_tissue.py
diff options
context:
space:
mode:
Diffstat (limited to 'Code/refine_tissue.py')
-rw-r--r--Code/refine_tissue.py302
1 files changed, 302 insertions, 0 deletions
diff --git a/Code/refine_tissue.py b/Code/refine_tissue.py
new file mode 100644
index 0000000..8bc111c
--- /dev/null
+++ b/Code/refine_tissue.py
@@ -0,0 +1,302 @@
+# Usage: python refine_tissue.py > ../Data/information/experiment.and.tissue.2.txt
+# Set cmd =
+#
+# Purpose: for each RNA-seq in experiment.and.tissue.1.txt, add a column suggested.tissue as its tissue annotation.
+#
+# 2 June 2017, slcu, hui
+# Last modified 19 June 2017, slcu, hui
+
+import os, sys, operator
+import string
+
+
# Lookup table for get_singular_form(): plural/adjectival words and free-text
# tissue descriptions (lower case) -> singular or normalised tissue label.
# Built once at import time instead of on every call.
# NOTE(review): misspelled keys such as 'vegatative ...' and 'siluge ...' are
# kept as they are; presumably they mirror spellings found in the annotations.
_SINGULAR_FORMS = {
    # single words
    'seedlings': 'seedling',
    'roots': 'root',
    'leaves': 'leaf',
    'flowers': 'flower',
    'floral': 'flower',
    'shoots': 'shoot',
    'apices': 'apex',
    'stems': 'stem',
    'seeds': 'seed',
    'petals': 'petal',  # was mapped to itself ('petals')
    'sepals': 'sepal',
    'embryos': 'embryo',
    'embryonic': 'embryo',
    'cotyledons': 'cotyledon',
    'hairs': 'hair',
    'meristems': 'meristem',
    'epidermal': 'epidermis',
    'apical': 'apex',
    'buds': 'bud',
    'vacuoles': 'vacuole',
    'vacuolar': 'vacuole',
    'tips': 'tip',
    'pollens': 'pollen',
    'hypocotyls': 'hypocotyl',
    'tubes': 'tube',
    'stomatal': 'stomata',
    'ovules': 'ovule',  # was reversed ('ovule' -> 'ovules')
    'pistils': 'pistil',
    'anthers': 'anther',
    'carpels': 'carpel',
    'pedicle': 'pedicel',
    'vascular': 'vasculum',
    # free-text descriptions
    'whole plant': 'seedling',
    'inflorescence': 'flower.inflorescence',
    'inflorescences': 'flower.inflorescence',
    'whole seedling': 'seedling',
    'whole rosette': 'leaf.rosette',
    'whole aerial seedling': 'seedling.aerial',
    'vegatative shoot apical meristem': 'shoot.apical.meristem',
    'inflorescence containing stage 8 and younger flowers': 'flower.inflorescence',
    'plant roots': 'root',
    'entire vegetative rosette': 'leaf.rosette',
    'fungal-colonized plant roots': 'root.fungal.colonized',
    'rosettes - 5 leaves stage': 'leaf.rosette',
    '2-week old seedlings without roots': 'seedling.no.roots',
    'immature inflorescence': 'inflorescence.immature',
    'rosette leaves': 'leaf.rosette',
    'plant seedling': 'seedling',
    'entire aerial part': 'aerial.tissue',
    '14-d-old entire seedlings': 'seedling',
    'rosette leaf': 'leaf.rosette',
    'whole seedlings': 'seedling',
    'etiolated 5d-old seedlings': 'seedling.etiolated',
    'root structure': 'root',
    'mature leaves': 'leaf.mature',
    'root tip': 'root.tip',
    '10d-old seedling': 'seedling',
    'primary root tip': 'root.tip',
    'epidermis including guard cells': 'epidermis',
    'root tip tissue': 'root.tip',
    'anther stage 4-7': 'flower.anther',
    'anther': 'flower.anther',
    'embryo': 'seed.embryo',
    'etiolated seedlings': 'seedling.etiolated',
    '21 days-old seedlings': 'seedling',
    'aerial tissue': 'aerial.tissue',
    'endosperm': 'seed.endosperm',
    'whole seed': 'seed',
    'pistils pollinated for 8 hours': 'flower.pistil.pollinated',
    'primary root': 'root',
    'whole floral bud': 'flower.bud',
    'whole seedling root': 'seedling.root',
    # 'whole root' used to be listed twice ('seedling.root', then 'root');
    # the later entry always won, so only that one is kept.
    'whole root': 'root',
    'whole plants': 'seedling',
    'aerial shoots': 'shoot',
    'flower bud': 'flower.bud',
    'aerial seedling': 'seedling.aerial',
    'anthers at stage 4-7': 'flower.anther',
    'carpels (collected manually from 15 developing inflorescences)': 'flower.carpel',
    'ath_shoot_meristem_1': 'shoot.meristem',
    'ath_whole_plant_1': 'seedling',
    'ath_whole_plant_2': 'seedling',
    'whole seeds': 'seed',
    '3-day-old root': 'root',
    'unopened flower buds': 'flower.bud',
    'first true leaf': 'leaf',
    '7 dag seedlings': 'seedling',
    'facs-sorted protoplasts from aerial tissue of 10-day old seedlings': 'seedling.protoplasts',
    'inflorescences and siliques': 'inflorescences.and.siliques',
    'Epidermis including guard cells epidermis including guard cells': 'leaf.stomata.epidermis',
    'base stem': 'stem',
    'siliques': 'silique',
    'whole organism': 'seedling',
    'seedling shoot': 'seedling.shoot',
    '10-day-old seedlings and inflorescences from 25-day-old plants': 'seedling.and.inflorescence',
    'shoot apical meristem': 'shoot.apical.meristem',
    'expanded mature leaves from 28 day old plants': 'leaf',
    'aerial tissues of 15 day seedlings': 'aerial.tissue',
    'whole parts': 'seedling',
    'aerial organs': 'aerial.tissue',
    'lower stem': 'stem',
    'upper stem': 'stem',
    'rosette': 'leaf.rosette',
    'root and shoot': 'root.and.shoot',
    'cell culture': 'cell.culture',
    'aerial part': 'aerial.tissue',
    'aerial': 'aerial.tissue',
    'whole plantlet without root': 'seedling',
    'sorted endodermis (facs)': 'endodermis.facs-sorted',
    'siluge without seeds': 'seed',
    'first internode': 'stem',
    'rosettes': 'leaf.rosette',
    'hypocotyl': 'seedling.hypocotyl',
    'somatic embryo': 'seed.embryo',
}


def get_singular_form(w):
    ''' Return the singular / normalised tissue label for w.

    w is looked up verbatim (the match is case-sensitive and whitespace-sensitive)
    in _SINGULAR_FORMS.  If w is not a key of the table, w itself is returned unchanged.
    '''
    return _SINGULAR_FORMS.get(w, w)
+
def remove_parenthese(s):
    ''' Return the part of s that precedes the first '(' (s itself if it has no '(').

    Whitespace in front of the parenthesis is kept, e.g. 'leaf (young)' gives 'leaf '.
    '''
    before_paren, _, _ = s.partition('(')
    return before_paren
+
+
+
def make_singular(lst):
    ''' Return a new list in which every known plural word of lst is replaced by its singular.

    Words that are not in the plural table are copied unchanged; lst itself is not modified.
    '''
    plural_to_singular = {
        'roots': 'root',
        'shoots': 'shoot',
        'leaves': 'leaf',
        'flowers': 'flower',
        'anthers': 'anther',
        'hairs': 'hair',
        'seedlings': 'seedling',
        'apices': 'apex',
        'buds': 'bud',
        'siliques': 'silique',
        'rosettes': 'rosette',
        'meristems': 'meristem',
        'sepals': 'sepal',
        'petals': 'petal',
        'inflorescences': 'inflorescence',
        'carpels': 'carpel',
        'seeds': 'seed',
        'pistils': 'pistil',
        'stamens': 'stamen',
        'ovules': 'ovule',
        'tissues': 'tissue',
        'ovaries': 'ovary',
        'veins': 'vein',
        'nodes': 'node',
        'internodes': 'internode',
        'fibres': 'fibre',
        'hypocotyls': 'hypocotyl',
        'cotyledons': 'cotyledon',
        'plants': 'plant',
        'embryos': 'embryo',
    }
    return [plural_to_singular.get(word, word) for word in lst]
+
# Keyword table for map_tissue(): space-separated keywords -> tissue label.
# A key matches a description when every one of its words occurs in it.
# Entry order matters: among equally scored matches the earliest entry wins.
_TISSUE_KEYWORDS = {
    'hypocotyl': 'seedling.hypocotyl',
    'hypocotyl seedling': 'seedling.hypocotyl',
    'leaf': 'leaf',
    'leaf petiole': 'leaf.petiole',
    'petiole': 'leaf.petiole',
    'leaf blade': 'leaf.blade',
    'leaf first true': 'leaf',
    'leaf stomata': 'leaf.stomata',
    'stomata': 'leaf.stomata',
    'chlorophyll': 'leaf.chlorophyll',
    'vein': 'leaf.vein',
    'leaf vein': 'leaf.vein',
    'leaf lamina': 'leaf.lamina',
    'leaf rosette': 'leaf.rosette',  # was 'leaf rosette' (space instead of dot)
    'rosette': 'leaf.rosette',
    'rosette leaf': 'leaf.rosette',
    'shoot': 'shoot',
    'aerial shoot': 'aerial.shoot',
    'shoot apex': 'shoot.apex',
    'shoot tip': 'shoot.apex',
    'flower': 'flower',
    'flower petal': 'flower.petal',
    'flower sepal': 'flower.sepal',
    'flower stamen': 'flower.stamen',
    'flower anther': 'flower.anther',
    'flower carpel': 'flower.carpel',
    'flower pistil': 'flower.pistil',
    'flower inflorescence': 'flower.inflorescence',
    'stigma': 'flower.stigma',
    'filament': 'flower.filament',
    'style': 'flower.style',
    'anther': 'flower.anther',
    'petal': 'flower.petal',
    'sepal': 'flower.sepal',
    'stamen': 'flower.stamen',
    'carpel': 'flower.carpel',
    'pistil': 'flower.pistil',
    'ovary': 'flower.ovary',
    'pedicel': 'flower.pedicel',
    'ovule': 'flower.ovule',
    'inflorescence': 'flower.inflorescence',
    'seed': 'seed',
    'epicotyl': 'seed.epicotyl',
    'radicle': 'seed.radicle',
    'embryo': 'seed.embryo',
    'endosperm': 'seed.endosperm',
    'endodermis': 'endodermis',
    'stem': 'stem',
    'pith': 'pith',
    'protoxylem': 'protoxylem',
    'xylem': 'xylem',
    'phloem': 'phloem',
    'sclerenchyma': 'sclerenchyma',
    'bast fibre': 'bast.fibre',
    'cortex': 'cortex',
    'parenchyma': 'parenchyma',
    'mesophyll': 'leaf.mesophyll',
    'shoot apical meristem': 'meristem.shoot.apical',
    'root apical meristem': 'meristem.root.apical',  # was '.meristem.root.apical' (stray leading dot)
    'apical meristem': 'meristem.apical',
    'floral meristem': 'meristem.floral',
    'inflorescence meristem': 'meristem.inflorescence',
    'meristem': 'meristem',
    'meristem shoot': 'meristem.shoot',
    'cotyledon': 'cotyledon',
    'apical': 'apical',
    'basal': 'basal',
    'root': 'root',
    'root apex': 'root.apex',
    'root tip': 'root.tip',
    'root primary tip': 'root.primary.tip',
    'root cap': 'root.cap',
    'root lateral': 'root.lateral',
    'root primary': 'root.primary',
    'root hair': 'root.hairs',
    'bud': 'bud',
    'bud axillary': 'bud.axillary',
    'bud lateral': 'bud.axillary',
    'bud apical': 'bud.apical',
    'bud floral': 'bud.flower',
    'bud flower': 'bud.flower',
    'bud meristem': 'bud.meristem',
    'internode': 'stem.internode',
    'node': 'stem.node',
    'vascular': 'vasculum',
    'epidermis': 'epidermis',
    'seedling': 'seedling',
    'plant': 'seedling',
    'whole plant': 'seedling',
    'whole': 'seedling',
    'whole parts': 'seedling',
    'whole root': 'root',
    'seedling root': 'seedling.root',
    'seedling shoot': 'seedling.shoot',
    'seedling etiolated': 'seedling.etiolated',
    'aerial': 'aerial',
    'aerial tissue': 'aerial.tissue',
    'aerial seedling': 'seedling.aerial',
    'silique': 'silique',
    'unknown': 'unknown',
    'siluge': 'seed',
    'bundle sheath': 'leaf',
}


def map_tissue(s):
    ''' Return the most likely tissue label for the free-text description s.

    s is lower-cased, split on whitespace and passed through make_singular().
    A key of _TISSUE_KEYWORDS matches when all of its words occur among the
    resulting words.  The score of a match is the total number of occurrences
    of its words in the description, so repeated words (see get_words) and
    multi-word keys weigh more.  The label with the highest score is returned;
    on a tie the key listed first in the table wins (this relies on dicts
    keeping insertion order, i.e. Python 3.7+).  'unknown' is returned when no
    key matches.
    '''
    words = make_singular(s.lower().split())
    best_tissue = 'unknown'
    best_score = 0
    for key, tissue in _TISSUE_KEYWORDS.items():
        key_words = key.split()
        if not all(kw in words for kw in key_words):
            continue
        score = sum(words.count(kw) for kw in key_words)
        # Strictly greater: keeps the earliest entry among equal scores,
        # like the stable descending sort this loop replaces.
        if score > best_score:
            best_tissue = tissue
            best_score = score
    return best_tissue
+
+
def repeat_words(s):
    ''' Expand a counted word such as 'meristem(2)' into 'meristem meristem'.

    s is stripped first.  Without a '(' the stripped string is returned as is.
    Otherwise the text before '(' is repeated n times, separated by single
    spaces, where n is the integer between '(' and the following ')'.
    If the closing ')' is missing, everything after '(' is taken as the count.
    If the parenthesised text is not an integer (e.g. 'carpels (by hand)'),
    the text before '(' is returned once instead of raising ValueError.
    '''
    s = s.strip()
    open_idx = s.find('(')
    if open_idx < 0:
        return s
    word = s[:open_idx]
    close_idx = s.find(')', open_idx + 1)
    if close_idx < 0:
        # No closing parenthesis: a plain s[open_idx+1:-1] slice would
        # silently drop the last digit of the count.
        count_text = s[open_idx + 1:]
    else:
        count_text = s[open_idx + 1:close_idx]
    try:
        n = int(count_text)
    except ValueError:
        return word
    return ' '.join([word] * n)
+
def get_words(s):
    ''' Turn a ';'-separated tissue string into a space-separated word string.

    s is in the form meristem(2);leaf(2);bud(1) or shoot.meristem.
    Items with a count, e.g. meristem(2), are expanded by repeat_words().
    Items without a count are split on '.', so shoot.meristem contributes
    'shoot' and 'meristem'; a bare word such as leaf is kept as it is (it used
    to be dropped silently).  Empty items, e.g. from a trailing ';', are skipped.
    '''
    result = []
    for item in s.split(';'):
        if '(' in item:
            result.append(repeat_words(item))
        elif item.strip():
            # 'leaf'.split('.') is ['leaf'], so bare words are kept too.
            result.extend(item.split('.'))
    return ' '.join(result)
+
def remove_punctuation(s):
    ''' Return s with every underscore replaced by a space.

    Despite the name, the underscore is the only character that is replaced.
    '''
    pieces = s.split('_')
    return ' '.join(pieces)
+
# main
#
# Reads the first four columns of ../Data/temp/experiment.and.tissue.1.txt
# (run.id, inferred.tissue, biosample.tissue, biosample.id) and prints every
# row to stdout with a fifth column, suggested.tissue, appended.

if not os.path.exists('../Data/temp/experiment.and.tissue.1.txt'):
    # stdout is normally redirected into the result file (see Usage at the
    # top), so the hint goes to stderr and the exit status is non-zero.
    sys.stderr.write('Run python assign_tissue.py > ../Data/temp/experiment.and.tissue.1.txt first.\n')
    sys.exit(1)

cmd = 'cut -f 1-4 ../Data/temp/experiment.and.tissue.1.txt > ../Data/temp/a.txt' # input generated by python assign_tissue.py
if os.system(cmd) != 0:
    # Do not carry on with a missing or stale ../Data/temp/a.txt.
    sys.stderr.write('Command failed: ' + cmd + '\n')
    sys.exit(1)

print('run.id\tinferred.tissue\tbiosample.tissue\tbiosample.id\tsuggested.tissue')
with open('../Data/temp/a.txt') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue # blank line, nothing to annotate
        lst = line.split('\t')
        # strip() also removes trailing empty columns; pad with the missing
        # value marker '.' so that the row has four columns again.
        lst += ['.'] * (4 - len(lst))

        biosample_tissue = lst[2]
        if biosample_tissue not in ('part_unknown', '.', ''):
            # The biosample annotation is available: prefer it.
            s = map_tissue(remove_punctuation(biosample_tissue))
        else:
            # Fall back on the inferred tissue, e.g. meristem(2);leaf(2) or leaf(3).
            s = lst[1]
            if ';' in s:
                s = get_words(s)
            else:
                s = remove_parenthese(s)
            s = map_tissue(remove_punctuation(s))

        print('\t'.join(lst) + '\t' + s)