3 files changed, 40 insertions, 38 deletions
diff --git a/Code/assign_tissue.py b/Code/assign_tissue.py
index 782ab78..4a81e18 100644
--- a/Code/assign_tissue.py
+++ b/Code/assign_tissue.py
@@ -6,9 +6,9 @@
 #
 # 2 June 2017, slcu, hui
 # Last modified 19 June 2017, slcu, hui
+# Last modified 5 November 2022, hui
 
 import os, sys, json
-import urllib2
 
 def make_tissue_dict(fname):
     f = open(fname)
@@ -42,7 +42,7 @@ def make_sample_dict(fname):
     lines = f.readlines()
     f.close()
     d = {}
-    for line in lines[1:]:
+    for line in lines[1:]: # skip the head line
         line = line.strip()
         lst = line.split('\t')
         if len(lst) >= 2:
@@ -94,24 +94,23 @@ def stringfy_json(d):
 
 def make_information(s, info_dir):
     lst = s.split('...')
-    sample_id = lst[0]
+    sample_id = lst[0] # looks like SAMD00012930
     filename = '%s/%s.json' % (info_dir, sample_id)
-    #url = 'https://www.ebi.ac.uk/biosamples/api/samples/search/findByAccession?accession=%s' % (sample_id)
     if not os.path.exists(filename):
-        cmd = 'curl -s -H Content-Type:application/json https://www.ebi.ac.uk/biosamples/api/samples/search/findByAccession\?accession\=%s > %s' % (sample_id, filename)        
+        #cmd = 'curl -s -H Content-Type:application/json https://www.ebi.ac.uk/biosamples/api/samples/search/findByAccession\?accession\=%s > %s' % (sample_id, filename)
+        cmd = 'curl -s -H Content-Type:application/json https://www.ebi.ac.uk/biosamples/samples/%s.json > %s' % (sample_id, filename)
         os.system(cmd)
 
     f = open(filename)
     d = json.load(f)
     f.close()
-    #d = json.load(urllib2.urlopen(url))
-    if len(d["_embedded"]["samples"]) > 0:
-        return stringfy_json(d["_embedded"]["samples"][0])
+    if 'tissue' in d['characteristics']:
+        return d['characteristics']['tissue'][0]['text'].lower() + '\t' + sample_id
     else:
-        return '.\t.'
+        return '.' + '\t' + sample_id
 
 # main
-BIOSAMPLE_INFO_DIR = '/home/hui/network/v03/Data/information/BioSample' # put downloaded BioSample json files here
+BIOSAMPLE_INFO_DIR = '../Data/information/BioSample' # put downloaded BioSample json files here
 if not os.path.isdir(BIOSAMPLE_INFO_DIR):
     os.makedirs(BIOSAMPLE_INFO_DIR)
     
@@ -122,18 +121,21 @@ os.system(cmd)
 
 lst = get_experiment_id('../Data/temp/a.txt')
 d = make_tissue_dict('../Data/information/rnaseq_info_database.json') # excuting parse_xml.py > rnaseq_info_database.txt, mainly for getting tissue names (inferred by word frequency in the description)
-d2 = make_sample_dict('../Data/information/rnaseq_info_database.txt') # parse_xml.py > rnaseq_info_database.txt, mainly for getting the BioSample id for each run
+d2 = make_sample_dict('../Data/information/rnaseq_info_database.txt.temp') # parse_xml.py > rnaseq_info_database.txt, mainly for getting the BioSample id for each run
 head = ''
-for x in lst:
-    k = get_sra_id(x) # get rid of prefix R00... and suffix ..XX
-    s = x
-    if k in d:
-        s += '\t' + d[k]['tissue'] + '\t' + make_information(d2[k][0].decode('utf8'), BIOSAMPLE_INFO_DIR) + '\t' + d2[k][1].decode('utf8')
-    elif x.startswith('R0001') and ('146' in x or '147' in x): # inhouse data
-        s += '\t' + 'seedling\t' + '\t'.join(8*['.'])
-    elif x.startswith('R0002'): # pcubas (Spain) data
-        s += '\t' + 'meristem\t' + '\t'.join(8*['.'])
-    else:
-        s += '\t' + '\t'.join(9*['.']) # k is gene_id
-    print(s)
-    head += s.replace('\t', '_')  + '\t'
+# run.id  inferred.tissue biosample.tissue biosample.id
+with open('../Data/temp/experiment.and.tissue.1.txt', 'w', encoding='utf8') as f:
+    for x in lst:
+        s = x  # run id, such as R0DRR016125XXX
+        k = get_sra_id(x) # get rid of prefix R00... and suffix ..XX
+        if k in d:
+            s += '\t' + d[k]['tissue'] + '\t' + make_information(d2[k][0], BIOSAMPLE_INFO_DIR)
+        elif x.startswith('R0001') and ('146' in x or '147' in x): # inhouse data
+            s += '\t' + 'seedling\t' + '\t'.join(8*['.'])
+        elif x.startswith('R0002'): # pcubas (Spain) data
+            s += '\t' + 'meristem\t' + '\t'.join(8*['.'])
+        else:
+            s += '\t' + '\t'.join(9*['.']) # k is gene_id
+        f.write(s + '\n')
+        head += s.replace('\t', '_')  + '\t'
+
diff --git a/Code/correlation_per_group_fixed_number.R b/Code/correlation_per_group_fixed_number.R
index 3f48220..3407973 100644
--- a/Code/correlation_per_group_fixed_number.R
+++ b/Code/correlation_per_group_fixed_number.R
@@ -7,7 +7,7 @@
 # the groups and tissue label.  More specifically, within each group
 # there should be as few distinct tissues as possible.
 
-TISSUE.FILE    <- '../Data/information/experiment.and.tissue.txt'
+TISSUE.FILE    <- '../Data/information/experiment.and.tissue.2.txt'
 DATA.FILE      <- '../Data/history/expr/TPM.txt'
 TARGET.TF.FILE <- '../Data/information/target_tf.txt'
 AGINAME.FILE   <- '../Data/information/AGI-to-gene-names_v2.txt'
@@ -70,30 +70,33 @@ normalize <- function(X) {
 
 
 # Choose the optimal number of clusters such that they have best agreement with tissue labels
-# Added on 28 June 2017, slcu, hui
+# Added on 28 June 2017, slcu, hui, last modified on 5 Nov 2022, hui
 get.optimal.number.of.clusters <- function(X, clusters, tissue.matrix, min.cluster, max.cluster) {
     labels <- as.vector(tissue.matrix$suggested.tissue)
     labels <- unlist(lapply(labels, function(x) {e<-regexpr("\\.", x)[1]; if (e > 0) {y<-substr(x, 1, e-1)} else {x} })) # remove subcategories
-    tissue.label <- c()
+    run.label <- c() # what tissue does each run experiment come from?
     for (rseqid in colnames(X)) { # X is the gene expression matrix
         i <- which(as.vector(tissue.matrix$run.id) == rseqid) # tissue.matrix contains tissue information for each RNA-seq ID
 	suggested.tissue.name <- labels[i]
-	tissue.label <- c(tissue.label, suggested.tissue.name)
+	run.label <- c(run.label, suggested.tissue.name)
     }
-    best.cn <- min.cluster
+    best.cn <- min.cluster  # best cluster number
     best.mix.rate <- 0 # perfect mix rate is 1.0
+    # find the best cluster number that results in largest tissue homogeneity
     for (cn in seq(min.cluster, max.cluster, 1)) { # cn is number of clusters
         cut <- cutree(clusters, cn)
         mix.sum <- 0
         mix.count <- 0
         for (c in unique(cut)) { # each cluster
             sample.index <- (cut == c)
-            t <- tissue.label[sample.index]
-            mix.sum <- mix.sum + max(as.data.frame(table(t))$Freq)/sum(as.data.frame(table(t))$Freq)
+            t <- run.label[sample.index] # t is a list of tissue names in this cluster
+	    max.freq = max(as.data.frame(table(t))$Freq) # what is the frequency of the most frequent tissue name in this cluster?
+	    sum.freq = sum(as.data.frame(table(t))$Freq) # what is the total number of tissue names in this cluster?
+            mix.sum <- mix.sum + max.freq/sum.freq
             mix.count <- mix.count + 1
         }
-        mix.rate <- log10(length(tissue.label)/mix.count) * (mix.sum/mix.count)^8 # make sure high tissue homogeneity is much preferred. also make sure the cluster is not too small.
-        #cat(sprintf('get.optimal.number.of.clusters: %d\t%4.1f\t%4.2f\t%4.2f\n', cn, length(tissue.label)/mix.count, mix.sum/mix.count, mix.rate))
+        mix.rate <- log10(length(run.label)/mix.count) * (mix.sum/mix.count)^8 # make sure high tissue homogeneity is much preferred. also make sure the cluster is not too small.
+        cat(sprintf('In get.optimal.number.of.clusters: cluster number:%d\t%4.1f\tpercentage:%4.2f\tmix.rate:%4.2f\n', cn, length(run.label)/mix.count, mix.sum/mix.count, mix.rate))
         if (mix.rate > best.mix.rate) {
             best.mix.rate <- mix.rate
             best.cn <- cn
@@ -112,7 +115,7 @@ total.pair <- dim(target.tf)[1]
 
 cat(sprintf('min.cluster=%d, max.cluster=%d, min.sample=%d, r.tau=%4.2f\n', min.cluster, max.cluster, min.sample, r.tau))
 cat('Hclust ...\n')
-X2 <- normalize(X2) # each row of X2 has mean 0 and standard deviation 1.
+X2 <- normalize(X) # each row of X2 has mean 0 and standard deviation 1.
 clusters <- hclust(dist(t(X2)), method = 'average')
 cat(sprintf('Determine optimal number of clusters ...\n'))
 tissue <- read.table(TISSUE.FILE, header=TRUE, check.names=FALSE, sep='\t')
diff --git a/Code/refine_tissue.py b/Code/refine_tissue.py
index 8bc111c..182de1a 100644
--- a/Code/refine_tissue.py
+++ b/Code/refine_tissue.py
@@ -126,7 +126,7 @@ def map_tissue(s):
         'vein':'leaf.vein',
         'leaf vein':'leaf.vein',
         'leaf lamina':'leaf.lamina',                
-        'leaf rosette':'leaf rosette',
+        'leaf rosette':'leaf.rosette',
         'rosette':'leaf.rosette',
         'rosette leaf':'leaf.rosette',
         'shoot':'shoot',
@@ -286,9 +286,6 @@ for line in f:
     if lst[2] != 'part_unknown' and lst[2] != '.':
         s = lst[2].lower()
         s = map_tissue(remove_punctuation(s))
-    # elif lst[2] == '.':
-    #     s = lst[1]
-    #     s = remove_parenthese(s)
     else:
         s = lst[1]
         if not ';' in s: