# Usage: python download_and_map.py
#        python download_and_map.py run_ids.txt
#
# Edit DAILY_MAP_NUMBER and MIN_FASTQ_FILE_SIZE in configure.py.
#
# This program checks RNA_SEQ_INFO_FILE for not-yet-downloaded, *public* RNA-seq data, makes a list of them, then downloads and maps them using Salmon.
# It is very important to prepare RNA_SEQ_INFO_FILE first (see parse_ena_xml.py). In fact, only the first column of RNA_SEQ_INFO_FILE is required: a list of RNA-seq run IDs.
#
# Purpose: automate downloading RNA-seq files from ENA or SRA. This program checks MAPPED_RDATA_DIR and RAW_RDATA_DIR to ensure that we are not
# re-mapping or re-downloading already mapped or downloaded data.
#
# Note: use download_data2() to download from SRA, and download_and_map_data() to download from ENA (closer to Cambridge so faster).
# This script depends on get_TPM_by_salmon.py.
#
# 23 DEC 2016, hui, slcu. Updated: 9 Feb 2017
# Last modified 10 APR 2017, hui, slcu
# Last reviewed 31 July 2018

import os, sys, glob, json
import fnmatch
import time
import re
from datetime import datetime

##########################################################################################
from configure import DAILY_MAP_NUMBER, MIN_FASTQ_FILE_SIZE, RNA_SEQ_INFO_FILE, DOWNLOADED_SRA_ID_LOG_FILE, IGNORED_SRA_ID_LOG_FILE, UPDATE_NETWORK_LOG_FILE, MAPPED_RDATA_DIR, RAW_RDATA_DIR, SALMON_MAP_RESULT_DIR

FASTQ_DUMP_PATH = '/home/hui/software/sratoolkit/sratoolkit.2.8.0-ubuntu64/bin/fastq-dump'
##########################################################################################

def glob_files(directory, pattern):
    ''' Return all file names (without paths) given directory and pattern. '''
    result = []
    for root, dirnames, filenames in os.walk(directory):
        for filename in fnmatch.filter(filenames, pattern):
            result.append(filename)
    return result


def glob_files_include_path(directory, pattern):
    ''' Return all file names (with paths) given directory and pattern. '''
    result = []
    for root, dirnames, filenames in os.walk(directory):
        for filename in fnmatch.filter(filenames, pattern):
            result.append(os.path.join(root, filename))
    # also check download*.txt files, where downloaded files are recorded
    for fname in glob.glob(os.path.join(directory, 'download*.txt')):
        f = open(fname)
        lines = f.readlines()
        f.close()
        for line in lines:
            line = line.strip()
            if fnmatch.fnmatch(os.path.basename(line), pattern):
                result.append(os.path.join(directory, line))
    return result


def get_list(fname):
    ''' Convert a file to a list; each line is an element in the list. Only unique elements are returned. '''
    if not os.path.exists(fname):
        return []
    result = []
    f = open(fname)
    d = {}
    for line in f:
        line = line.strip()
        lst = line.split()
        if lst == []:  # skip empty lines
            continue
        s = lst[0].strip()  # SRR, ERR, or DRR id
        if (not s in d) and ('SRR' in s or 'ERR' in s or 'DRR' in s):
            d[s] = 1
            result.append(s)
    f.close()
    return result
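
# A hedged example of get_list()'s behaviour (run IDs are hypothetical): given a
# log file containing
#
#   START at 2018-07-31_0900
#   SRR1042754  downloaded
#   SRR1042754
#   ERR1399578
#
# it returns ['SRR1042754', 'ERR1399578']: the first column of every line that
# mentions SRR/ERR/DRR, with duplicates removed and input order preserved.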
def make_download_list(all_run_ids, mapped_dir, rna_data_info_dict):
    ''' Make a list of sample IDs to download next. These samples must not have been downloaded yet.

        all_run_ids        - a list of NextGen-Seq IDs to select from
        mapped_dir         - contains all mapped samples
        rna_data_info_dict - a dictionary containing all RNA-seq samples from ENA
    '''
    result = []
    mapped_files = glob_files(mapped_dir, '*_quant.txt')
    mapped_run_ids = get_list(DOWNLOADED_SRA_ID_LOG_FILE)
    small_ids = get_list(IGNORED_SRA_ID_LOG_FILE)  # these files are too small
    for x in sorted(all_run_ids, reverse=True):  # SRR first, then ERR, then DRR
        include_me = x in rna_data_info_dict and rna_data_info_dict[x] > 0
        # not mapped yet and is RNA-seq
        if (x + '_quant.txt') not in mapped_files and x not in result and x not in small_ids and x not in mapped_run_ids and include_me:
            result.append(x)
    return result


def num_of_digits(s):
    count = 0
    for c in s:
        if c.isdigit():
            count += 1
    return count


def get_file_url(fname):
    ''' Collect fastq.gz FTP URLs (for wget) from a wget --spider log. '''
    f = open(fname)
    url_list = []
    for line in f:
        line = line.strip()
        if 'ftp://' in line and '.fastq.gz' in line:
            lst = line.split()
            address = lst[-1].strip()
            if '.fastq.gz' in address and address.startswith('ftp://') and not address in url_list:
                url_list.append(address)
    f.close()
    return url_list


def get_file_size(fname):
    ''' Extract the remote file size from a wget --spider log. '''
    sz = 0
    f = open(fname)
    for line in f:
        line = line.strip()
        if line.startswith('==> SIZE'):
            lst = line.split()
            sz = int(lst[-1])
    f.close()
    return sz


def get_remote_file_size(link):
    cmd = 'rm -f ../Data/temp/wget_temp_file1.txt'
    os.system(cmd)
    cmd = 'wget --spider %s 2> ../Data/temp/wget_temp_file1.txt' % (link)
    os.system(cmd)
    return get_file_size('../Data/temp/wget_temp_file1.txt')


def get_sample_id(fname):
    ''' Extract the run ID from a fastq file name. '''
    index = fname.find('.fastq.gz')
    if index < 0:
        return ''
    s = fname[0:index]
    lst = s.split('_')
    return lst[0]


def download_and_map_data(lst, daily_map_num, dest):
    ''' Download data from ENA; fast (but can be interrupted). '''
    downloaded_files = []  # a list of paths to downloaded files; small files (size less than MIN_FASTQ_FILE_SIZE) won't be included
    map_list = []
    if len(lst) < daily_map_num or daily_map_num < 1:
        return downloaded_files, map_list
    count = 0
    for line in lst:  # lst - a list of run IDs
        run_id = line
        dir1 = line[0:6]
        dir2 = ''
        n = num_of_digits(line)
        address = ''
        if n == 6:  # follow ENA's data path convention
            address = 'ftp://ftp.sra.ebi.ac.uk/vol1/fastq/%s/%s/' % (dir1, run_id)
        elif n == 7:
            dir2 = '00' + run_id[-1]
            address = 'ftp://ftp.sra.ebi.ac.uk/vol1/fastq/%s/%s/%s/' % (dir1, dir2, run_id)
        elif n == 8:
            dir2 = '0' + run_id[-2:]
            address = 'ftp://ftp.sra.ebi.ac.uk/vol1/fastq/%s/%s/%s/' % (dir1, dir2, run_id)
        elif n == 9:
            dir2 = run_id[-3:]
            address = 'ftp://ftp.sra.ebi.ac.uk/vol1/fastq/%s/%s/%s/' % (dir1, dir2, run_id)
        if os.path.exists('../Data/temp/wget_temp_file0.txt'):
            cmd = 'rm -f ../Data/temp/wget_temp_file0.txt'
            os.system(cmd)
        cmd = 'wget --spider -T 20 %s 2> ../Data/temp/wget_temp_file0.txt' % (os.path.join(address, '*.gz'))
        os.system(cmd)
        url_lst = get_file_url('../Data/temp/wget_temp_file0.txt')
        if url_lst == []:
            write_download_log_file(IGNORED_SRA_ID_LOG_FILE, run_id + '\n')
            time.sleep(1)
        curr_lst = []
        for link in url_lst:
            sz = get_remote_file_size(link)
            if sz >= MIN_FASTQ_FILE_SIZE:  # remote file must be big enough
                cmd = 'wget %s -P %s' % (link, dest)
                os.system(cmd)
                file_path = os.path.join(dest, os.path.basename(link))
                curr_lst.append(file_path)
                downloaded_files.append(file_path)
            else:
                print('[download_and_map.py] IGNORE [%d MB] %s' % (int(sz/1000000.0), link))
                file_name = os.path.basename(link)
                sample_id = get_sample_id(file_name)
                write_download_log_file(IGNORED_SRA_ID_LOG_FILE, sample_id + '\n')
        print(curr_lst)
        if curr_lst != []:
            salmon_map(curr_lst)
            map_list.append(run_id)
            count += 1
            # Remove raw files (as they occupy lots of space)
            for f in downloaded_files:
                if os.path.exists(f):
                    print('[download_and_map.py] To save space, I am removing %s.' % (f))
                    os.remove(f)
                    time.sleep(1)
        if count >= daily_map_num:
            return downloaded_files, map_list
        time.sleep(3)
    return downloaded_files, map_list
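
# A worked example of the ENA path convention implemented above (accessions are
# hypothetical). dir1 is the first six characters of the run ID; for a 7-digit
# run ID the middle directory is '00' + the last digit, for 8 digits it is
# '0' + the last two digits, and for 9 digits the last three digits:
#
#   SRR123456  -> ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR123/SRR123456/
#   SRR1042754 -> ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR104/004/SRR1042754/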
def download_data2(lst, dest):
    ''' Download data from SRA; slow. '''
    if not os.path.exists(FASTQ_DUMP_PATH):
        print('%s does not exist.' % (FASTQ_DUMP_PATH))
        sys.exit()
    downloaded_files = []  # a list of paths to downloaded files
    for line in lst:
        run_id = line.strip()
        cmd = '%s -N 1000000 --split-files --skip-technical %s' % (FASTQ_DUMP_PATH, run_id)
        print('\n' + cmd)
        os.system(cmd)
        if glob.glob('%s*fastq*' % (run_id)) != []:  # files were successfully downloaded
            cmd = 'mv %s*fastq* %s' % (run_id, dest)
            print(cmd)
            os.system(cmd)
            fastq_file_lst = glob.glob(os.path.join(dest, '%s*fastq*' % (run_id)))
            if len(fastq_file_lst) == 1:  # rename file
                cmd = 'mv %s %s' % (fastq_file_lst[0], os.path.join(dest, run_id + '.fastq'))
                os.system(cmd)
            cmd = 'gzip %s' % (os.path.join(dest, run_id + '*.fastq'))
            print(cmd)
            os.system(cmd)
            for fname in glob.glob(os.path.join(dest, '%s*gz' % (run_id))):
                downloaded_files.append(fname)
        else:
            write_download_log_file(IGNORED_SRA_ID_LOG_FILE, run_id + '\n')
    return downloaded_files


def salmon_map(lst):
    gz_file = '../Data/temp/gz_files.txt'
    if os.path.exists(gz_file):
        cmd = 'rm -f %s' % (gz_file)  # remove old parameter file (if any); gz means gzip, as fastq files are usually zipped
        os.system(cmd)
    f = open(gz_file, 'w')
    f.write('\n'.join(lst))  # lst contains paths to fastq files
    f.close()
    print('Start mapping %s' % (' '.join(lst)))
    cmd = 'python get_TPM_by_salmon.py %s > /dev/null 2>&1' % (gz_file)  # mapped files will be saved in SALMON_MAP_RESULT_DIR
    os.system(cmd)


def write_download_log_file(fname, s):
    if not os.path.exists(fname):
        f = open(fname, 'w')
    else:
        f = open(fname, 'a')
    f.write(s)
    f.close()


def write_network_log_file(s, fname):
    f = open(fname, 'a')
    curr_time = datetime.now().strftime('%Y-%m-%d %H:%M')
    s = '[' + curr_time + ']: ' + s
    if not '\n' in s:
        s += '\n'
    f.write(s)
    f.close()


def last_session_finished(fname):
    ''' Return True if the log file ends with DONE. '''
    if not os.path.exists(fname):
        return True
    f = open(fname)
    lines = f.readlines()
    f.close()
    last_line = lines[-1]
    if last_line.strip() == '':  # a newline
        print('[download_and_map.py] Last line in file %s is empty. The last line must start with DONE.' % (fname))
        sys.exit()
    lst = last_line.split()
    return lst[0] == 'DONE'


def read_ena_data_info(fname):
    d = {}
    f = open(fname)
    for line in f:
        line = line.strip()
        lst = line.split()
        if lst == []:  # skip empty lines
            continue
        run_id = lst[0]
        d[run_id] = 1
    f.close()
    return d


def read_ena_data_info_json(fname):
    d = {}
    with open(fname) as json_data:
        json_dict = json.load(json_data)
        for run_id in json_dict:
            d[run_id] = 1
    return d


def read_run_ids_from_file(fname):
    ''' Read run IDs (first column) from a user-provided file, skipping comment lines. '''
    f = open(fname)
    result = []
    for line in f:
        line = line.strip()
        if not line.startswith('#') and 'RR' in line:
            lst = line.split()
            result.append(lst[0])
    f.close()
    return list(set(result))
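
# A minimal sketch of the RNA_SEQ_INFO_FILE layout that read_ena_data_info_json()
# assumes: a JSON object keyed by run ID, as produced by parse_ena_xml.py. Only
# the keys are used here; the per-run fields shown are illustrative guesses:
#
#   {
#       "SRR1042754": {"library_strategy": "RNA-Seq"},
#       "ERR1399578": {"library_strategy": "RNA-Seq"}
#   }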

## main

# For filtering RNA-seq data
if not os.path.exists(RNA_SEQ_INFO_FILE):
    print('[download_and_map.py] Must provide %s. See parse_ena_xml.py about how to make it.' % (RNA_SEQ_INFO_FILE))
    sys.exit()

# If there is not enough disk space for storing the downloaded sequencing data, stop.
# Each block is assumed to be 4 KB, so this works on Linux/UNIX systems only.
available_G = 4 * os.statvfs('/home').f_bavail / (1024 * 1024)  # available space in GB
if available_G < 3 * DAILY_MAP_NUMBER:  # require roughly 3 GB per planned run
    print('[download_and_map.py] home directory does not have enough space (only %d G available).' % (available_G))
    write_network_log_file('[download_and_map.py] home directory does not have enough space (only %d G available).' % (available_G), UPDATE_NETWORK_LOG_FILE)
    sys.exit()

if not last_session_finished(DOWNLOADED_SRA_ID_LOG_FILE):  # last session not finished
    print('[download_and_map.py] last downloading and mapping session has not finished yet. You can edit file %s to remove the last START line.' % (DOWNLOADED_SRA_ID_LOG_FILE))
    sys.exit()

rna_data_info_dict = read_ena_data_info_json(RNA_SEQ_INFO_FILE)  # rna_data_info_dict contains only RNA-seq IDs

# Generate DRR/ERR/SRR ids to download
if len(sys.argv) > 1:  # user has provided a list of IDs in a file
    download_list = read_run_ids_from_file(sys.argv[1])
    DAILY_MAP_NUMBER = len(download_list)
else:
    print('[download_and_map.py] Prepare download list ...')
    download_list = make_download_list(rna_data_info_dict.keys(), MAPPED_RDATA_DIR, rna_data_info_dict)
    print('[download_and_map.py] There are %d run IDs from which you could select %d of them.' % (len(download_list), DAILY_MAP_NUMBER))

# Make a record in the download log
curr_time = datetime.now().strftime('%Y-%m-%d_%H%M')
write_download_log_file(DOWNLOADED_SRA_ID_LOG_FILE, 'START at %s\n' % (curr_time))

# Download these RNA-seq IDs and map them using salmon
print('[download_and_map.py] Start downloading and mapping ...')
downloaded_file_paths, map_list = download_and_map_data(download_list, DAILY_MAP_NUMBER, RAW_RDATA_DIR)  # alternatively, use download_data2() to download from SRA (in the US)

# After mapping is finished, move all resulting files to MAPPED_RDATA_DIR
curr_time = datetime.now().strftime('%Y-%m-%d_%H%M')
new_dir_name = MAPPED_RDATA_DIR
if not os.path.isdir(new_dir_name):
    os.makedirs(new_dir_name)
if glob.glob('%s/*_quant.txt' % (SALMON_MAP_RESULT_DIR.rstrip('/'))) != []:
    cmd = 'mv %s/*_quant.txt %s' % (SALMON_MAP_RESULT_DIR.rstrip('/'), new_dir_name)
    os.system(cmd)
    print('[download_and_map.py] Done. Check directory %s.' % (os.path.abspath(new_dir_name)))
else:
    print('[download_and_map.py] No quant files to move.')

write_download_log_file(DOWNLOADED_SRA_ID_LOG_FILE, '%s\n' % ('\n'.join(map_list)))
write_download_log_file(DOWNLOADED_SRA_ID_LOG_FILE, 'DONE at %s\n' % (curr_time))
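
# For reference, a finished session leaves DOWNLOADED_SRA_ID_LOG_FILE looking
# like this (run IDs hypothetical); last_session_finished() only checks that
# the final line starts with DONE:
#
#   START at 2018-07-31_0900
#   SRR1042754
#   ERR1399578
#   DONE at 2018-07-31_1730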