summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Lan Hui <lanhui@zjnu.edu.cn> 2024-08-11 15:01:58 +0800
committer: Lan Hui <lanhui@zjnu.edu.cn> 2024-08-11 15:01:58 +0800
commit: 824629706b75b3c5b4275c2f6d6ce540bf6dfc73 (patch)
tree: b0a153ac7241d024fb90f7d759d5074b08b23dd1
parent: 0070478f7f57ee86d6a0bc2d3550eb5a5e71a7ec (diff)
dos2unix
l--------- Code/.#update_network_by_force.py 1
-rw-r--r-- Code/configure.py 112
-rw-r--r-- Code/merge_edges.py 4
-rw-r--r-- Code/requirements.txt 1
4 files changed, 62 insertions, 56 deletions
diff --git a/Code/.#update_network_by_force.py b/Code/.#update_network_by_force.py
new file mode 120000
index 0000000..212abbe
--- /dev/null
+++ b/Code/.#update_network_by_force.py
@@ -0,0 +1 @@
+lanhui@lh-ubuntu22.3552:1722838551 \ No newline at end of file
diff --git a/Code/configure.py b/Code/configure.py
index 73fc9cc..2f0fbd9 100644
--- a/Code/configure.py
+++ b/Code/configure.py
@@ -1,56 +1,56 @@
-# From get_TPM_by_salmon.py
-SALMON = '/home/lanhui/brain/Salmon/Salmon-0.7.2_linux_x86_64/bin/salmon' # salmon software path
-SALMON_INDEX = '/home/lanhui/brain/Salmon/salmon_index'
-TRANSCRIPTOME = '/home/lanhui/brain/Salmon/Arabidopsis_thaliana.TAIR10.cdna.all.fa'
-SALMON_MAP_RESULT_DIR = '../Data/temp/salmon_map_result'
-KMER = 31
-
-# From download_and_map.py
-DAILY_MAP_NUMBER = 4 # download this many samples each time. I have tested the values of 3, 4, 5, 8.
-MIN_FASTQ_FILE_SIZE = 200000000 # in bytes, approximately 200MB
-RNA_SEQ_INFO_FILE = '../Data/information/rnaseq_info_database.json' # some data downloaded from ENA are not RNA-seq (they are ChIP-seq). Use this file to tell whether the file is RNA-seq
-DOWNLOADED_SRA_ID_LOG_FILE = '../Data/log/download_log.txt' # a list of downloaded SRA IDs
-IGNORED_SRA_ID_LOG_FILE = '../Data/log/download_log_small_sized_ids.txt' # store SRA IDs with small file size.
-MAPPED_RDATA_DIR = '../Data/R/Mapped/public' # mapped RNA-seq (file names ended with _quant.txt) go here
-RAW_RDATA_DIR = '/disk1/Data/R/Raw' # downloaded files go here, was "../Data/R/Raw" (now almost full).
-
-# From update_network.py
-# Don'T change the following paths and names
-HISTORY_DIR = '../Data/history/edges/many_targets' # each edge file contains edges for many targets
-HISTORY_DIR2 = '../Data/history/edges/one_target' # edges.txt.* files are here, all edge files have the name edges.txt.*, the leading string 'edges.txt' must be present.
-TIMESTAMP_FILE = '../Data/log/file_timestamp.txt' # record last modified time of several important files
-SAMPLE_SIZE_FILE = '../Data/log/total.samples.txt' # each line contains a date and the number of samples on and after that date
-TEMP_DIR = '../Data/temp'
-
-PARAMETER_FOR_BUILDCMATRIX = '../Data/parameter/parameter_for_buildCmatrix.txt'
-PARAMETER_FOR_BUILDRMATRIX = '../Data/parameter/parameter_for_buildRmatrix.txt'
-PARAMETER_FOR_NET = '../Data/parameter/parameter_for_net.txt'
-PARAMETER_FOR_NET_TRAVADB_STRESS = '../Data/parameter/parameter_for_net_travadb_stress.txt'
-PARAMETER_FOR_NET_TRAVADB_MAP = '../Data/parameter/parameter_for_net_travadb_map.txt'
-PARAMETER_FOR_NET_MILD_DROUGHT = '../Data/parameter/parameter_for_net_mild_drought.txt'
-PARAMETER_FOR_NET_WIGGELAB_DIURNAL = '../Data/parameter/parameter_for_net_wiggelab_diurnal.txt'
-
-BINDING_FILE = '../Data/history/bind/binding.txt'
-TPM_FILE = '../Data/history/expr/TPM.txt' # gene expression data
-
-BUILDRMATRIX_RENEW_INTERVAL = 28 # check every 28 days for updating TPM.txt
-MIN_RNA_SEQ_INCREASE = -999 # minimum RNA-seq experiments needed when updating parameter_for_buildRmatrix.txt
-UPDATE_NETWORK_LOG_FILE = '../Data/log/update.network.log.txt' # network update log. We should check this file from time to time.
-NEW_OR_UPDATED_CHIP_FILE = '../Data/log/new.or.updated.chip.file.txt'
-
-RNA_SEQ_INFO_DATABASE = '../Data/information/rnaseq_info_database.txt' # same as RNA_SEQ_INFO_FILE
-RNA_SEQ_INFO_DATABASE_JSON = '../Data/information/rnaseq_info_database.json'
-
-GENE_ID_FIRST_TWO_LETTERS = 'AT'
-MEMORY_STRENGTH = 365 # memory retention power (larger value means better memory)
-
-#
-MAPPED_CDATA_DIR = '../Data/C/Mapped' # mapped ChIp-seq data
-
-# Used in merge_edges.py
-EDGE_POOL_DIR = '/disk1/edge_pool'
-MERGED_EDGE_FILE = '../Data/temp/edges.txt'
-SQLITE_EDGE_FILE = '../Data/temp/edges.sqlite'
-DIFF_EDGE_FILE = '../Data/temp/edges-diff.txt' # the difference between two edge files from yesterday and from today
-
-TARGET_TF_FILE = '../Data/information/target_tf.txt'
+# From get_TPM_by_salmon.py
+SALMON = '/home/lanhui/brain/Salmon/Salmon-0.7.2_linux_x86_64/bin/salmon' # salmon software path
+SALMON_INDEX = '/home/lanhui/brain/Salmon/salmon_index'
+TRANSCRIPTOME = '/home/lanhui/brain/Salmon/Arabidopsis_thaliana.TAIR10.cdna.all.fa'
+SALMON_MAP_RESULT_DIR = '../Data/temp/salmon_map_result'
+KMER = 31
+
+# From download_and_map.py
+DAILY_MAP_NUMBER = 4 # download this many samples each time. I have tested the values of 3, 4, 5, 8.
+MIN_FASTQ_FILE_SIZE = 200000000 # in bytes, approximately 200MB
+RNA_SEQ_INFO_FILE = '../Data/information/rnaseq_info_database.json' # some data downloaded from ENA are not RNA-seq (they are ChIP-seq). Use this file to tell whether the file is RNA-seq
+DOWNLOADED_SRA_ID_LOG_FILE = '../Data/log/download_log.txt' # a list of downloaded SRA IDs
+IGNORED_SRA_ID_LOG_FILE = '../Data/log/download_log_small_sized_ids.txt' # store SRA IDs with small file size.
+MAPPED_RDATA_DIR = '../Data/R/Mapped/public' # mapped RNA-seq files (file names ending with _quant.txt) go here
+RAW_RDATA_DIR = '/disk1/Data/R/Raw' # downloaded files go here, was "../Data/R/Raw" (now almost full).
+
+# From update_network.py
+# Don't change the following paths and names
+HISTORY_DIR = '../Data/history/edges/many_targets' # each edge file contains edges for many targets
+HISTORY_DIR2 = '../Data/history/edges/one_target' # edges.txt.* files are here; every edge file is named edges.txt.*, and the leading string 'edges.txt' must be present.
+TIMESTAMP_FILE = '../Data/log/file_timestamp.txt' # record last modified time of several important files
+SAMPLE_SIZE_FILE = '../Data/log/total.samples.txt' # each line contains a date and the number of samples on and after that date
+TEMP_DIR = '../Data/temp'
+
+PARAMETER_FOR_BUILDCMATRIX = '../Data/parameter/parameter_for_buildCmatrix.txt'
+PARAMETER_FOR_BUILDRMATRIX = '../Data/parameter/parameter_for_buildRmatrix.txt'
+PARAMETER_FOR_NET = '../Data/parameter/parameter_for_net.txt'
+PARAMETER_FOR_NET_TRAVADB_STRESS = '../Data/parameter/parameter_for_net_travadb_stress.txt'
+PARAMETER_FOR_NET_TRAVADB_MAP = '../Data/parameter/parameter_for_net_travadb_map.txt'
+PARAMETER_FOR_NET_MILD_DROUGHT = '../Data/parameter/parameter_for_net_mild_drought.txt'
+PARAMETER_FOR_NET_WIGGELAB_DIURNAL = '../Data/parameter/parameter_for_net_wiggelab_diurnal.txt'
+
+BINDING_FILE = '../Data/history/bind/binding.txt'
+TPM_FILE = '../Data/history/expr/TPM.txt' # gene expression data
+
+BUILDRMATRIX_RENEW_INTERVAL = 28 # check every 28 days for updating TPM.txt
+MIN_RNA_SEQ_INCREASE = -999 # minimum RNA-seq experiments needed when updating parameter_for_buildRmatrix.txt
+UPDATE_NETWORK_LOG_FILE = '../Data/log/update.network.log.txt' # network update log. We should check this file from time to time.
+NEW_OR_UPDATED_CHIP_FILE = '../Data/log/new.or.updated.chip.file.txt'
+
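+RNA_SEQ_INFO_DATABASE = '../Data/information/rnaseq_info_database.txt' # plain-text counterpart of RNA_SEQ_INFO_DATABASE_JSON; note that RNA_SEQ_INFO_FILE points to the .json file, not to this .txt file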
+RNA_SEQ_INFO_DATABASE_JSON = '../Data/information/rnaseq_info_database.json'
+
+GENE_ID_FIRST_TWO_LETTERS = 'AT'
+MEMORY_STRENGTH = 365 # memory retention power (larger value means better memory)
+
+#
+MAPPED_CDATA_DIR = '../Data/C/Mapped' # mapped ChIP-seq data
+
+# Used in merge_edges.py
+EDGE_POOL_DIR = '../Data/history/edge_pool'
+MERGED_EDGE_FILE = '../Data/temp/edges.txt'
+SQLITE_EDGE_FILE = '../Data/temp/edges.sqlite'
+DIFF_EDGE_FILE = '../Data/temp/edges-diff.txt' # the difference between two edge files from yesterday and from today
+
+TARGET_TF_FILE = '../Data/information/target_tf.txt'
diff --git a/Code/merge_edges.py b/Code/merge_edges.py
index 6bbd2f0..872faa9 100644
--- a/Code/merge_edges.py
+++ b/Code/merge_edges.py
@@ -23,6 +23,7 @@
import os, operator, sys, math, datetime, glob
from log import write_log_file
from configure import EDGE_POOL_DIR, MERGED_EDGE_FILE, SQLITE_EDGE_FILE, UPDATE_NETWORK_LOG_FILE
+from utils import make_paths
import sqlite3
def get_number_of_RNAseq_ids(s):
@@ -134,6 +135,9 @@ def make_new_edge(d):
##main
+
+make_paths(EDGE_POOL_DIR)
+
write_log_file('[merge_edges.py] Go through all edge files in the edge pool %s.' % (EDGE_POOL_DIR) , UPDATE_NETWORK_LOG_FILE)
d = {} # d will contain all edges computed so far, where the key is TargetGeneID_TFGeneID, and the value is a list of tuples. Each tuple is a historical edge.
file_count = 0
diff --git a/Code/requirements.txt b/Code/requirements.txt
new file mode 100644
index 0000000..4d07dfe
--- /dev/null
+++ b/Code/requirements.txt
@@ -0,0 +1 @@
+networkx