Diffstat (limited to 'Code')
-rw-r--r--   Code/make_target_tf_agris.py   94
1 files changed, 56 insertions, 38 deletions
diff --git a/Code/make_target_tf_agris.py b/Code/make_target_tf_agris.py
index c96c8fa..8e2cb2e 100644
--- a/Code/make_target_tf_agris.py
+++ b/Code/make_target_tf_agris.py
@@ -1,39 +1,57 @@
-# Make target_tf from AtRegNet.txt
-# Usage: python make_target_tf_agris.py > ../Data/information/target_tf_agris.txt
-
-fname = '../Data/information/AtRegNet.txt'
-
-sample_id = 'C0000000000001'
-
-f = open(fname)
-lines = f.readlines()
-f.close()
-
-d = {}
-count = 2
-duplicate = 0
-for line in lines[1:]:
-    line = line.strip()
-    lst = line.split('\t')
-    if len(lst) >= 5:
-        tf0     = lst[1].upper().strip()
-        target0 = lst[4].upper().strip()
-        tf_lst = tf0.split('/')
-        target_lst = target0.split('/')
-        for tf in tf_lst:
-            for target in target_lst:
-                if tf.startswith('AT') and target.startswith('AT'):
-                    k = target + '.' + tf
-                    if k in d:
-                        #print('Warning at line %d  ' % (count))
-                        duplicate += 1
-                    else:
-                        d[k] = [target, tf, sample_id]
-    count += 1
-
-
-print('pairs %d' % len(d))
-print('duplicate %d' % (duplicate))
-for k in sorted(d.keys()):
-    print('\t'.join(d[k]))
+# Make target_tf from AtRegNet.txt and from AtRegNet.csv (much bigger).
+# Usage: python3 make_target_tf_agris.py > ../Data/information/target_tf_agris.txt
+# Last modified on 16 Feb 2021 by Hui Lan <lanhui@zjnu.edu.cn>
+import codecs, sys
+
+class GeneIdValidator:
+    def __init__(self, organism):
+        self.org = organism
+    def isvalid(self, ID):
+        if self.org == 'ath':
+            return ID.startswith('AT') and len(ID) == 9 and ID[3] == 'G'
+        else:
+            return False
+
+
+def make_dictionary(fname, separator):
+    sample_id = 'C0000000000001'
+
+    id_validator = GeneIdValidator('ath')
+
+    f = codecs.open(fname, encoding='utf-8', errors='ignore')
+    lines = f.readlines()
+    f.close()
+
+    d = {}
+    duplicate = 0
+    for line in lines[1:]:
+        line = line.strip()
+        lst = line.split(separator)
+        if len(lst) >= 5:
+            tf0     = lst[1].upper().strip()
+            target0 = lst[4].upper().strip()
+            tf_lst = [x.strip() for x in tf0.split('/') if id_validator.isvalid(x)]
+            target_lst = [x.strip() for x in target0.split('/') if id_validator.isvalid(x)]
+            for tf in tf_lst:
+                for target in target_lst:
+                    if tf.startswith('AT') and target.startswith('AT'):
+                        k = target + '.' + tf
+                        if k in d:
+                            duplicate += 1
+                        else:
+                            d[k] = [target, tf, sample_id]
+
+    return duplicate, d
+
+
+if __name__ == '__main__':
+    duplicate1, d = make_dictionary('../Data/information/AtRegNet.txt', '\t')  # OSU AtRegNet, an older version.  Most are confirmed tf target pairs???
+    duplicate2, d2 = make_dictionary('../Data/information/AtRegNet2.csv', ',') # OSU AtRegNet downloaded in Feb 2021.  This file includes much more tf target pairs (confirmed, unconfirmed, dap-seq, etc)
+    d.update(d2) # combine two dictionaries into one.
+
+    print('pairs: %d' % len(d), file=sys.stderr)
+    print('duplicate: %d' % (duplicate1 + duplicate2), file=sys.stderr)
+    for k in sorted(d.keys()):
+        print('\t'.join(d[k]))
+
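Note: the rewritten script filters candidate genes through GeneIdValidator.isvalid(), which accepts only Arabidopsis AGI locus identifiers (strings starting with 'AT', exactly nine characters long, with 'G' in the fourth position) before building target.tf pairs. Below is a minimal standalone sketch of that check, restated outside the class for illustration; the gene IDs used are hypothetical examples and are not taken from the repository's data files.

# Sketch: the ID rule applied by GeneIdValidator.isvalid() in the new script,
# shown on a few hypothetical Arabidopsis AGI identifiers.  A valid ID looks
# like 'AT1G12345': 'AT' + chromosome digit + 'G' + five digits (9 characters).

def isvalid(ID, org='ath'):
    # Same rule as GeneIdValidator.isvalid in the diff above.
    if org == 'ath':
        return ID.startswith('AT') and len(ID) == 9 and ID[3] == 'G'
    return False

for gene_id in ['AT1G12345', 'AT5G67890', 'ATMYB2', 'AT1G1234', 'OS01G12345']:
    print(gene_id, isvalid(gene_id))

# Only the first two IDs pass; short TF names such as 'ATMYB2' and
# non-Arabidopsis IDs are discarded before target.tf pairs are built,
# which is why the new version drops pairs that the old startswith('AT')
# check would have let through.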
