diff options
author | Hui Lan <lanhui@zjnu.edu.cn> | 2022-11-05 22:16:57 +0800 |
---|---|---|
committer | Hui Lan <lanhui@zjnu.edu.cn> | 2022-11-05 22:16:57 +0800 |
commit | 4d2b4aa5a97cc8f640a7b934298c871dbaea18d2 (patch) | |
tree | ba10eb723f737c2d4e281f795c75437847ee4b7a | |
parent | d3459851212333812f925da65de6def5be5a682d (diff) |
Make correlation_per_group_fixed_number.R work, and update its upstream scripts (i.e., assign_tissue.py and refine_tissue.py).
-rw-r--r-- | Code/assign_tissue.py | 50 | ||||
-rw-r--r-- | Code/correlation_per_group_fixed_number.R | 23 | ||||
-rw-r--r-- | Code/refine_tissue.py | 5 | ||||
-rw-r--r-- | brain.documentation/QUICKSTART.rst | 9 |
4 files changed, 46 insertions, 41 deletions
diff --git a/Code/assign_tissue.py b/Code/assign_tissue.py index 782ab78..4a81e18 100644 --- a/Code/assign_tissue.py +++ b/Code/assign_tissue.py @@ -6,9 +6,9 @@ # # 2 June 2017, slcu, hui # Last modified 19 June 2017, slcu, hui +# Last modified 5 November 2022, hui import os, sys, json -import urllib2 def make_tissue_dict(fname): f = open(fname) @@ -42,7 +42,7 @@ def make_sample_dict(fname): lines = f.readlines() f.close() d = {} - for line in lines[1:]: + for line in lines[1:]: # skip the head line line = line.strip() lst = line.split('\t') if len(lst) >= 2: @@ -94,24 +94,23 @@ def stringfy_json(d): def make_information(s, info_dir): lst = s.split('...') - sample_id = lst[0] + sample_id = lst[0] # looks like SAMD00012930 filename = '%s/%s.json' % (info_dir, sample_id) - #url = 'https://www.ebi.ac.uk/biosamples/api/samples/search/findByAccession?accession=%s' % (sample_id) if not os.path.exists(filename): - cmd = 'curl -s -H Content-Type:application/json https://www.ebi.ac.uk/biosamples/api/samples/search/findByAccession\?accession\=%s > %s' % (sample_id, filename) + #cmd = 'curl -s -H Content-Type:application/json https://www.ebi.ac.uk/biosamples/api/samples/search/findByAccession\?accession\=%s > %s' % (sample_id, filename) + cmd = 'curl -s -H Content-Type:application/json https://www.ebi.ac.uk/biosamples/samples/%s.json > %s' % (sample_id, filename) os.system(cmd) f = open(filename) d = json.load(f) f.close() - #d = json.load(urllib2.urlopen(url)) - if len(d["_embedded"]["samples"]) > 0: - return stringfy_json(d["_embedded"]["samples"][0]) + if 'tissue' in d['characteristics']: + return d['characteristics']['tissue'][0]['text'].lower() + '\t' + sample_id else: - return '.\t.' + return '.' + '\t' + sample_id # main -BIOSAMPLE_INFO_DIR = '/home/hui/network/v03/Data/information/BioSample' # put downloaded BioSample json files here +BIOSAMPLE_INFO_DIR = '../Data/information/BioSample' # put downloaded BioSample json files here if not os.path.isdir(BIOSAMPLE_INFO_DIR): os.makedirs(BIOSAMPLE_INFO_DIR) @@ -122,18 +121,21 @@ os.system(cmd) lst = get_experiment_id('../Data/temp/a.txt') d = make_tissue_dict('../Data/information/rnaseq_info_database.json') # excuting parse_xml.py > rnaseq_info_database.txt, mainly for getting tissue names (inferred by word frequency in the description) -d2 = make_sample_dict('../Data/information/rnaseq_info_database.txt') # parse_xml.py > rnaseq_info_database.txt, mainly for getting the BioSample id for each run +d2 = make_sample_dict('../Data/information/rnaseq_info_database.txt.temp') # parse_xml.py > rnaseq_info_database.txt, mainly for getting the BioSample id for each run head = '' -for x in lst: - k = get_sra_id(x) # get rid of prefix R00... and suffix ..XX - s = x - if k in d: - s += '\t' + d[k]['tissue'] + '\t' + make_information(d2[k][0].decode('utf8'), BIOSAMPLE_INFO_DIR) + '\t' + d2[k][1].decode('utf8') - elif x.startswith('R0001') and ('146' in x or '147' in x): # inhouse data - s += '\t' + 'seedling\t' + '\t'.join(8*['.']) - elif x.startswith('R0002'): # pcubas (Spain) data - s += '\t' + 'meristem\t' + '\t'.join(8*['.']) - else: - s += '\t' + '\t'.join(9*['.']) # k is gene_id - print(s) - head += s.replace('\t', '_') + '\t' +# run.id inferred.tissue biosample.tissue biosample.id +with open('../Data/temp/experiment.and.tissue.1.txt', 'w', encoding='utf8') as f: + for x in lst: + s = x # run id, such as R0DRR016125XXX + k = get_sra_id(x) # get rid of prefix R00... and suffix ..XX + if k in d: + s += '\t' + d[k]['tissue'] + '\t' + make_information(d2[k][0], BIOSAMPLE_INFO_DIR) + elif x.startswith('R0001') and ('146' in x or '147' in x): # inhouse data + s += '\t' + 'seedling\t' + '\t'.join(8*['.']) + elif x.startswith('R0002'): # pcubas (Spain) data + s += '\t' + 'meristem\t' + '\t'.join(8*['.']) + else: + s += '\t' + '\t'.join(9*['.']) # k is gene_id + f.write(s + '\n') + head += s.replace('\t', '_') + '\t' + diff --git a/Code/correlation_per_group_fixed_number.R b/Code/correlation_per_group_fixed_number.R index 3f48220..3407973 100644 --- a/Code/correlation_per_group_fixed_number.R +++ b/Code/correlation_per_group_fixed_number.R @@ -7,7 +7,7 @@ # the groups and tissue label. More specifically, within each group # there should be as few distinct tissues as possible. -TISSUE.FILE <- '../Data/information/experiment.and.tissue.txt' +TISSUE.FILE <- '../Data/information/experiment.and.tissue.2.txt' DATA.FILE <- '../Data/history/expr/TPM.txt' TARGET.TF.FILE <- '../Data/information/target_tf.txt' AGINAME.FILE <- '../Data/information/AGI-to-gene-names_v2.txt' @@ -70,30 +70,33 @@ normalize <- function(X) { # Choose the optimal number of clusters such that they have best agreement with tissue labels -# Added on 28 June 2017, slcu, hui +# Added on 28 June 2017, slcu, hui, last modified on 5 Nov 2022, hui get.optimal.number.of.clusters <- function(X, clusters, tissue.matrix, min.cluster, max.cluster) { labels <- as.vector(tissue.matrix$suggested.tissue) labels <- unlist(lapply(labels, function(x) {e<-regexpr("\\.", x)[1]; if (e > 0) {y<-substr(x, 1, e-1)} else {x} })) # remove subcategories - tissue.label <- c() + run.label <- c() # what tissue does each run experiment come from? for (rseqid in colnames(X)) { # X is the gene expression matrix i <- which(as.vector(tissue.matrix$run.id) == rseqid) # tissue.matrix contains tissue information for each RNA-seq ID suggested.tissue.name <- labels[i] - tissue.label <- c(tissue.label, suggested.tissue.name) + run.label <- c(run.label, suggested.tissue.name) } - best.cn <- min.cluster + best.cn <- min.cluster # best cluster number best.mix.rate <- 0 # perfect mix rate is 1.0 + # find the best cluster number that results in largest tissue homogeneity for (cn in seq(min.cluster, max.cluster, 1)) { # cn is number of clusters cut <- cutree(clusters, cn) mix.sum <- 0 mix.count <- 0 for (c in unique(cut)) { # each cluster sample.index <- (cut == c) - t <- tissue.label[sample.index] - mix.sum <- mix.sum + max(as.data.frame(table(t))$Freq)/sum(as.data.frame(table(t))$Freq) + t <- run.label[sample.index] # t is a list of tissue names in this cluster + max.freq = max(as.data.frame(table(t))$Freq) # what is the frequency of the most frequent tissue name in this cluster? + sum.freq = sum(as.data.frame(table(t))$Freq) # what is the total number of tissue names in this cluster? + mix.sum <- mix.sum + max.freq/sum.freq mix.count <- mix.count + 1 } - mix.rate <- log10(length(tissue.label)/mix.count) * (mix.sum/mix.count)^8 # make sure high tissue homogeneity is much preferred. also make sure the cluster is not too small. - #cat(sprintf('get.optimal.number.of.clusters: %d\t%4.1f\t%4.2f\t%4.2f\n', cn, length(tissue.label)/mix.count, mix.sum/mix.count, mix.rate)) + mix.rate <- log10(length(run.label)/mix.count) * (mix.sum/mix.count)^8 # make sure high tissue homogeneity is much preferred. also make sure the cluster is not too small. + cat(sprintf('In get.optimal.number.of.clusters: cluster number:%d\t%4.1f\tpercentage:%4.2f\tmix.rate:%4.2f\n', cn, length(run.label)/mix.count, mix.sum/mix.count, mix.rate)) if (mix.rate > best.mix.rate) { best.mix.rate <- mix.rate best.cn <- cn @@ -112,7 +115,7 @@ total.pair <- dim(target.tf)[1] cat(sprintf('min.cluster=%d, max.cluster=%d, min.sample=%d, r.tau=%4.2f\n', min.cluster, max.cluster, min.sample, r.tau)) cat('Hclust ...\n') -X2 <- normalize(X2) # each row of X2 has mean 0 and standard deviation 1. +X2 <- normalize(X) # each row of X2 has mean 0 and standard deviation 1. clusters <- hclust(dist(t(X2)), method = 'average') cat(sprintf('Determine optimal number of clusters ...\n')) tissue <- read.table(TISSUE.FILE, header=TRUE, check.names=FALSE, sep='\t') diff --git a/Code/refine_tissue.py b/Code/refine_tissue.py index 8bc111c..182de1a 100644 --- a/Code/refine_tissue.py +++ b/Code/refine_tissue.py @@ -126,7 +126,7 @@ def map_tissue(s): 'vein':'leaf.vein', 'leaf vein':'leaf.vein', 'leaf lamina':'leaf.lamina', - 'leaf rosette':'leaf rosette', + 'leaf rosette':'leaf.rosette', 'rosette':'leaf.rosette', 'rosette leaf':'leaf.rosette', 'shoot':'shoot', @@ -286,9 +286,6 @@ for line in f: if lst[2] != 'part_unknown' and lst[2] != '.': s = lst[2].lower() s = map_tissue(remove_punctuation(s)) - # elif lst[2] == '.': - # s = lst[1] - # s = remove_parenthese(s) else: s = lst[1] if not ';' in s: diff --git a/brain.documentation/QUICKSTART.rst b/brain.documentation/QUICKSTART.rst index 4a5d95f..280def8 100644 --- a/brain.documentation/QUICKSTART.rst +++ b/brain.documentation/QUICKSTART.rst @@ -7,7 +7,7 @@ B R A I N :Version: 0.1.6 of 2022-10-09
-*Revision history: 9 June 2017, 22 June 2017, 4 July 2017, 11 July 2017 (created Section Drawing sub-networks), 18 July 2017 (created Section Uploading peak files), 10 October 2017 ('Enter a set of genes' under Section Exploring the network; created Section uRNA), 24 Jan 2018 (Enter 'AT1G24260 AGL9;SEP3 tissue specific tau 1.9' under Section Exploring the network), 3 August 2019 (design considerations), 5 August 2019 (updated create_edges0.py, added a section FAQs), 7 August 2019 (files needed for drawing a scatterplot online), 11 August 2019 (added information for correlation_per_group_fixed_number.R), 22 November 2019 (review and revise the entire content of this documentation), 4 April 2020 (use html_network.py to generate static html pages that contain a gene's regulators and regulatees; use cron job to copy these files to the web application), 6 February 2021 (write a section describing the running environment for brain), 9 October 2022 (remove PARAMETER_FOR from PARAMETER_FOR_BUILDRMATRIX_RENEW_INTERVAL, update section Automating downloading and updating the network)*
+*Revision history: 9 June 2017, 22 June 2017, 4 July 2017, 11 July 2017 (created Section Drawing sub-networks), 18 July 2017 (created Section Uploading peak files), 10 October 2017 ('Enter a set of genes' under Section Exploring the network; created Section uRNA), 24 Jan 2018 (Enter 'AT1G24260 AGL9;SEP3 tissue specific tau 1.9' under Section Exploring the network), 3 August 2019 (design considerations), 5 August 2019 (updated create_edges0.py, added a section FAQs), 7 August 2019 (files needed for drawing a scatterplot online), 11 August 2019 (added information for correlation_per_group_fixed_number.R), 22 November 2019 (review and revise the entire content of this documentation), 4 April 2020 (use html_network.py to generate static html pages that contain a gene's regulators and regulatees; use cron job to copy these files to the web application), 6 February 2021 (write a section describing the running environment for brain), 9 October 2022 (remove PARAMETER_FOR from PARAMETER_FOR_BUILDRMATRIX_RENEW_INTERVAL, update section Automating downloading and updating the network), 5 November 2022 (update assign_tissue.py, refine_tissue.py, correlation_per_group_fixed_number.R)*
.. contents::
@@ -1033,7 +1033,7 @@ Assign tissue names to RNA-seq samples We can assign each column (RNA-seq sample) in TPM.txt with a tissue name. If the RNA-seq sample's tissue is known, use it. Otherwise, predict it. The tissue information is used for displaying scatterplots, and for computing tissue-specific correlation coefficients (tissue.specific.correlation_). Follow the following three steps to get tissue names for RNA-seq samples:
-- python assign_tissue.py > ../Data/temp/experiment.and.tissue.1.txt.
+- python assign_tissue.py. This script writes to ../Data/temp/experiment.and.tissue.1.txt.
- python refine_tissue.py > ../Data/information/experiment.and.tissue.2.txt
@@ -1041,7 +1041,10 @@ We can assign each column (RNA-seq sample) in TPM.txt with a tissue name. If th - python update_rnaseq_info_json.py
-This script will update Data/information/rnaseq_info_database.json, which is used to display scatterplots. It will also call knn_classify.R to predict tissues for unknown RNA-seq samples. The result is saved in Data/information/experiment.and.tissue.txt.
+ This script will update Data/information/rnaseq_info_database.json,
+ which is used to display scatterplots. It will also call
+ knn_classify.R to predict tissues for unknown RNA-seq samples. The
+ result is saved in Data/information/experiment.and.tissue.txt.
Correlation and sample size
|