Code/make_parameter_bw.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127

# Usage: python make_parameter_bw.py
#        Edit the variable BW_PATHS, which is a file containing (almost all) paths to in-house bw files. Edit PARENT_DIR, which
#        will be used to make full path to the bw files. Amend name_map_dict.
# Purpose
# -------
#       Make a parameter file for buildCmatrix.py. For example:
#
#	@C0001100007141
#	PROTEIN_ID: 
#	PROTEIN_NAME:HTR13
#	DATA_NAME:2_71C_10_IP-HTR13-17c
#	DATA_FORMAT:bw
#	DESCRIPTION:in house ChIP data
#	LOCATION:/media/pw_synology3/PW_HiSeq_data/ChIP-seq/Mapped_data/2_71C/20150603_H3_2nd_rep_mapped_bw/ChIP-seq_h31_h33_col_rep1_20150607_max_fragment_500_bw/2_71C_10_IP-HTR13-17c_raw_trimmo_paired_truseq3-PE-2_2_10_5_1_bowtie2_TAIR10_ensembl_nomixed_sorted_rmdup_picard_genomenorm.bw
#	NOTE:
#
# 29 NOV 2016, hui, home

import sys, glob, os, operator
from geneid2name import make_gene_name_AGI_map_dict

BW_PATHS = 'bwfiles_unique.txt'  # Ideally, the input file should be sorted by library number, then sample number.
PARENT_DIR =  '/media/pw_synology3/PW_HiSeq_data/ChIP-seq/Mapped_data'

GENE_ID_TO_GENE_NAME = '../Data/information/AGI-to-gene-names_v2.txt'

def get_library_id(s):
    index = s.find('C') # in-house chip library id ends with a letter C
    if index > 0:
        return s[:index]
    else:
        return s

def get_bw_file_name(lst):
    for x in lst:
        if '.bw' in x:
            return x
    return ''

def get_sample_name(s):
    index = s.find('_raw') # a bw file name contains _raw, get the part before _raw if _raw exists
    if index > 0:
        return s[:index]
    else:
        return s

def get_sample_number(s):
    index = s.find('_S') # sample number is proceeded by _S
    if index > 0:
        if s[index+2].isdigit() and s[index+3].isdigit(): # two digit sample number
            return s[index+2:index+4]
        else: # one digit sample number
            return '0' + s[index+2]
    return '25'

def convert_name(s, d):
    ''' convert s to gene id '''
    if s.upper() in d:
        return d[s.upper()]
    else:
        return ' '

def get_gene_id_and_name(s, d):
    ''' Return a dictionary value, protein name and protein id given a sample file name.'''
    for k in sorted(d.keys(),reverse=True):
        if k.isdigit() and k.lower() in s.lower(): # k is a number, e.g., 833
            return (d[k], convert_name(d[k], agi2name_dict))
        if k.lower() in s.lower():
            return (k, convert_name(k, agi2name_dict))
    return ('name_unknown', 'id_unknown')

###

# key is a name of sample, value is either empty if it is a protein name, or the protein name if the key is a database number, e.g., 833.  If key is protein name, the program tries to find its gene id (AT...).  If key is a database number, the program gets its protein name and then tries to find its gene id. If not found, PROTEIN_ID or PROTEIN_NAME will be assigned _unknown.  So search _unknown to manually annotate PROTEIN_ID or PROTEIN_NAME.
name_map_dict = {'HTA11':'', 'H3':'', 'HTR13':'', 'HSP70':'', 'KIN10':'', 'EPM1':'', 'ELF3':'', 'phyB':'', '833':'KIN10', 'hos':'', 'HOS':'', 'HOS1':'', 'LUX':'', 'YHB':'','HSF1':'','3129':'HSP90','838':'phyB', '1506':'HSF1', 'SUMO':'', 'TIR1':'', 'PIF4':'', 'PIF':'','H2A':'', 'H2AZ':'', 'MNase':'', '544':'ELF4', '834':'PIF4', '745':'ELF3', 'EC1_S1':'', 'EC1_S1':'ELF3', 'EC2_S2':'ELF3', '1166':'LUX', '1167':'LUX', '3239':'REV', '1281':'MPK6', '1278':'SEP3', '1279':'FD', '1280':'FD-like', '1283':'HSP70', '1284':'DELLA', '1762':'FT', 'LFY':'', 'TFL1':''}

agi2name_dict = make_gene_name_AGI_map_dict(GENE_ID_TO_GENE_NAME)
f = open(BW_PATHS)

d_cid = {}
for line in f:
    line = line.strip()
    lst = line.split('/')
    bwfile = get_bw_file_name(lst)
    sample_name = get_sample_name(bwfile)
    sample_no   = get_sample_number(bwfile)
    library_id = get_library_id(lst[0])
    name, gid = get_gene_id_and_name(sample_name, name_map_dict)
    path = os.path.join(PARENT_DIR, line)

    if '2_' in library_id:  # some in-house library id starts with 2_, indicating that it is a replicate.
        index = library_id.find('_')
        library_id = library_id[index+1:]
        cid = '@C00011000%s%s' % (library_id.zfill(3), sample_no) # the highest digit of the last 9 digits is 1 to indicate that it is a replicate.  library number, 3 digits. sample number, 2 digits.
    else:
        cid = '@C00010000%s%s' % (library_id.zfill(3), sample_no)

    # handle duplicate library id
    if not cid in d_cid:
        d_cid[cid] = 1
    else: # produce a new library id
        d_cid[cid] += 1
        count = 26
        head = cid[-2:] # last two digits
        while True:
            new_cid = cid[:-2] + '%02d' % (count)
            if count >= 100:
                print('Error: count must be less than 100.')
                sys.exit()
            count += 1
            if not new_cid in d_cid:
                break
        cid = new_cid
        d_cid[cid] = 1

    # print contents for the parameter file
    print(cid)
    print('PROTEIN_ID:%s' % (gid))
    print('PROTEIN_NAME:%s' % (name))    
    print('DATA_NAME:%s' % (sample_name))
    print('DATA_FORMAT:%s' % ('bw'))
    print('DESCRIPTION:in house ChIP data')    
    print('LOCATION:%s' % (path))
    print('NOTE:')
    print('')

f.close()