Code/configure.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57

# Settings that originate from get_TPM_by_salmon.py

# Path of the salmon software.
SALMON = '/home/lanhui/brain/Salmon/Salmon-0.7.2_linux_x86_64/bin/salmon'
# presumably the salmon index location — confirm against get_TPM_by_salmon.py
SALMON_INDEX = '/home/lanhui/brain/Salmon/salmon_index'
# FASTA file named after the Arabidopsis thaliana TAIR10 cDNA set.
TRANSCRIPTOME = '/home/lanhui/brain/Salmon/Arabidopsis_thaliana.TAIR10.cdna.all.fa'
SALMON_MAP_RESULT_DIR = '../Data/temp/salmon_map_result'
# presumably the k-mer length handed to salmon — confirm
KMER = 31

# Settings that originate from download_and_map.py

# Download this many samples each time.  The values 3, 4, 5 and 8 have been tested.
DAILY_MAP_NUMBER = 2
# In bytes, approximately 200MB.
MIN_FASTQ_FILE_SIZE = 200000000
# Some data downloaded from ENA are not RNA-seq (they are ChIP-seq).
# Use this file to tell whether a file is RNA-seq.
RNA_SEQ_INFO_FILE = '../Data/information/rnaseq_info_database.json'
# A list of downloaded SRA IDs.
DOWNLOADED_SRA_ID_LOG_FILE = '../Data/log/download_log.txt'
# Stores SRA IDs with small file size.
IGNORED_SRA_ID_LOG_FILE = '../Data/log/download_log_small_sized_ids.txt'
# Mapped RNA-seq files (file names ending with _quant.txt) go here.
MAPPED_RDATA_DIR = '../Data/R/Mapped/public'
# Downloaded files go here.
RAW_RDATA_DIR = '../Data/R/Raw'


# Settings that originate from update_network.py
# Don't change the following paths and names.

# Each edge file in this directory contains edges for many targets.
HISTORY_DIR = '../Data/history/edges/many_targets'
# The edges.txt.* files are here.  All edge files have the name edges.txt.*;
# the leading string 'edges.txt' must be present.
HISTORY_DIR2 = '../Data/history/edges/one_target'
# Records the last modified time of several important files.
FILE_TIMESTAMP = '../Data/log/file_timestamp.txt'
# Each line contains a date and the number of samples on and after that date.
SAMPLE_SIZE_FILE = '../Data/log/total.samples.txt'
TEMP_DIR = '../Data/temp'

# Parameter files, one per constant.
PARAMETER_FOR_BUILDCMATRIX = '../Data/parameter/parameter_for_buildCmatrix.txt'
PARAMETER_FOR_BUILDRMATRIX = '../Data/parameter/parameter_for_buildRmatrix.txt'
PARAMETER_FOR_NET = '../Data/parameter/parameter_for_net.txt'
PARAMETER_FOR_NET_TRAVADB_STRESS = '../Data/parameter/parameter_for_net_travadb_stress.txt'
PARAMETER_FOR_NET_TRAVADB_MAP = '../Data/parameter/parameter_for_net_travadb_map.txt'
PARAMETER_FOR_NET_MILD_DROUGHT = '../Data/parameter/parameter_for_net_mild_drought.txt'
PARAMETER_FOR_NET_WIGGELAB_DIURNAL = '../Data/parameter/parameter_for_net_wiggelab_diurnal.txt'

BINDING_FILE = '../Data/history/bind/binding.txt'
# Gene expression data.
TPM_FILE = '../Data/history/expr/TPM.txt'

# Check every 18 days for updating TPM.txt.
PARAMETER_FOR_BUILDRMATRIX_RENEW_INTERVAL = 18
# Minimum RNA-seq experiments needed when updating parameter_for_buildRmatrix.txt.
# NOTE(review): the value is negative; presumably this makes the minimum
# always satisfied — confirm against update_network.py.
MIN_RNA_SEQ_INCREASE = -10000
# Network update log.  We should check this file from time to time.
UPDATE_NETWORK_LOG_FILE = '../Data/log/update.network.log.txt'
NEW_OR_UPDATED_CHIP_FILE = '../Data/log/new.or.updated.chip.file.txt'

# NOTE(review): the original comment called this "same as RNA_SEQ_INFO_FILE",
# yet this path ends in .txt while RNA_SEQ_INFO_FILE holds the .json path
# (identical to RNA_SEQ_INFO_DATABASE_JSON below) — confirm which is meant.
RNA_SEQ_INFO_DATABASE = '../Data/information/rnaseq_info_database.txt'
RNA_SEQ_INFO_DATABASE_JSON = '../Data/information/rnaseq_info_database.json'

GENE_ID_FIRST_TWO_LETTERS = 'AT'
# Strength of memory; a larger value means better memory.
MEMORY_STRENGTH = 365

# Directory holding mapped ChIP-seq data.
MAPPED_CDATA_DIR = '../Data/C/Mapped'

# Settings used in merge_edges.py
EDGE_POOL_DIR = '/disk1/edge_pool'
MERGED_EDGE_FILE = '../Data/temp/edges.txt'

# The difference between two edge files, from yesterday and from today.
DIFF_EDGE_FILE = '../Data/temp/edges-diff.txt'

TARGET_TF_FILE = '../Data/information/target_tf.txt'