summaryrefslogtreecommitdiff
path: root/Code/delete_not_used_fastq.py
blob: 67d368b3db3318a0a729f0e6a203409fb623d8c2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# Usage: python delete_not_used_fastq.py
#        Before running, set DIR to the directory containing all fastq.gz files,
#        and generate the USED_IDS file (one used run ID per line).
# Purpose: list fastq.gz files that are not used. Move them to to.be.deleted folder.
#
# 20 Apr 2017, slcu, hui

import glob
import os
import shutil
import sys

def read_ids(fname):
    """Read a list of IDs from a text file, one ID per line.

    Leading and trailing whitespace is stripped from every line, and lines
    that are empty after stripping are ignored so that a stray blank line
    cannot register the empty string as a valid ID.

    Args:
        fname: path of the text file to read.

    Returns:
        A dict mapping each ID found in the file to 1 (used as a set for
        fast membership tests).

    Raises:
        IOError/OSError: if the file cannot be opened or read.
    """
    d = {}
    # The context manager closes the file even if reading fails part-way.
    with open(fname) as f:
        for line in f:
            line = line.strip()
            if line:
                d[line] = 1
    return d

# Directory holding the fastq.gz files to be screened.
DIR = '/home/hui/network/R/Raw'
# Unused files are moved here (not deleted) so they can be reviewed first.
destDIR = os.path.join(DIR, 'to.be.deleted')
# Text file with one used run ID per line.  Generated by:
#   grep @ /home/hui/network/v03/Data/parameter/parameter_for_buildRmatrix.txt | grep 'SRR\|ERR\|DRR' | perl -pe 'substr($_, 0, 3) = ""; s/X+$//'
USED_IDS = '/home/hui/network/v03/Data/temp/used.sra.ids.txt'

# Divisor used to report file sizes in gigabytes (2**30 bytes).
BYTES_PER_GB = 1024 * 1024 * 1024

if not os.path.isdir(destDIR):
    os.makedirs(destDIR)

ids = read_ids(USED_IDS)
flst = glob.glob(os.path.join(DIR, '*.gz'))

print('file\tsize.in.G')
total_gb = 0.0  # running total of the sizes actually moved
for path in flst:
    fname = os.path.basename(path)
    # The ID is the part of the file name before the first '_' (e.g. a name
    # like SRR123_1.fastq.gz), or before the first '.' when there is no '_'.
    sep = '_' if '_' in fname else '.'
    run_id = fname.split(sep, 1)[0]
    if run_id in ids:
        continue  # still in use: leave the file where it is

    try:
        size_gb = os.path.getsize(path) / float(BYTES_PER_GB)
        # shutil.move instead of a shell 'mv' string: paths containing
        # spaces or shell metacharacters are handled correctly, and a
        # failure raises instead of being silently ignored.  Naming the
        # destination file explicitly keeps mv's overwrite behaviour.
        shutil.move(path, os.path.join(destDIR, fname))
    except (IOError, OSError, shutil.Error) as err:
        # Best effort: report the problem and carry on with the other
        # files, without counting this one in the total.
        sys.stderr.write('Failed to move %s: %s\n' % (path, err))
        continue

    print('%s\t%4.2f' % (path, size_gb))
    total_gb += size_gb
print('Total %4.2f G moved to %s' % (total_gb, destDIR))