diff options
author | Hui Lan <lanhui@zjnu.edu.cn> | 2021-02-16 11:39:51 +0800 |
---|---|---|
committer | Hui Lan <lanhui@zjnu.edu.cn> | 2021-02-16 11:39:51 +0800 |
commit | 676b4e16e03f128d26c3c5142eef953319b1f23b (patch) | |
tree | 3347ec1b58bb1ff110cfc460be2dfe437d5d7ba7 /Code/make_target_tf_agris.py | |
parent | 9f0f2149a008362558cdb18953f734285af6fd38 (diff) |
make_target_tf_agris.py: refactor code, and include tf-target pairs from the latest osu AtRegNet data.
Diffstat (limited to 'Code/make_target_tf_agris.py')
-rw-r--r-- | Code/make_target_tf_agris.py | 94 |
1 files changed, 56 insertions, 38 deletions
diff --git a/Code/make_target_tf_agris.py b/Code/make_target_tf_agris.py index c96c8fa..8e2cb2e 100644 --- a/Code/make_target_tf_agris.py +++ b/Code/make_target_tf_agris.py @@ -1,39 +1,57 @@ -# Make target_tf from AtRegNet.txt -# Usage: python make_target_tf_agris.py > ../Data/information/target_tf_agris.txt - -fname = '../Data/information/AtRegNet.txt' - -sample_id = 'C0000000000001' - -f = open(fname) -lines = f.readlines() -f.close() - -d = {} -count = 2 -duplicate = 0 -for line in lines[1:]: - line = line.strip() - lst = line.split('\t') - if len(lst) >= 5: - tf0 = lst[1].upper().strip() - target0 = lst[4].upper().strip() - tf_lst = tf0.split('/') - target_lst = target0.split('/') - for tf in tf_lst: - for target in target_lst: - if tf.startswith('AT') and target.startswith('AT'): - k = target + '.' + tf - if k in d: - #print('Warning at line %d ' % (count)) - duplicate += 1 - else: - d[k] = [target, tf, sample_id] - count += 1 - - -print('pairs %d' % len(d)) -print('duplicate %d' % (duplicate)) -for k in sorted(d.keys()): - print('\t'.join(d[k])) +# Make target_tf from AtRegNet.txt and from AtRegNet.csv (much bigger). 
# Usage: python3 make_target_tf_agris.py > ../Data/information/target_tf_agris.txt
# Last modified on 16 Feb 2021 by Hui Lan <lanhui@zjnu.edu.cn>
import codecs
import sys


class GeneIdValidator:
    """Decide whether a string has the shape of a gene ID for one organism."""

    def __init__(self, organism):
        # Organism code. Only 'ath' is recognised by isvalid(); any other
        # value makes every ID invalid.
        self.org = organism

    def isvalid(self, ID):
        """Return True if ID has the expected shape for the organism.

        For 'ath' the ID must start with 'AT', be exactly 9 characters long
        and have 'G' as its fourth character (e.g. AT1G01010).  The check is
        case-sensitive and does not tolerate surrounding whitespace, so
        callers must upper-case and strip the ID first.
        """
        if self.org == 'ath':
            return ID.startswith('AT') and len(ID) == 9 and ID[3] == 'G'
        return False


def _split_gene_ids(field, id_validator):
    """Return the valid gene IDs found in a '/'-separated field.

    Each candidate is upper-cased and stripped BEFORE validation, so IDs
    written as 'AT1G01010 / AT2G02020' are kept rather than rejected because
    of the padding spaces.
    """
    candidates = (x.strip() for x in field.upper().split('/'))
    return [x for x in candidates if id_validator.isvalid(x)]


def make_dictionary(fname, separator):
    """Collect unique (target, tf) pairs from a delimited text file.

    fname     -- path of the input file; read as UTF-8, undecodable bytes
                 are ignored.
    separator -- field separator, e.g. a tab or a comma.

    The first line is skipped (presumably a header row).  On every other
    line, field 1 holds the TF gene ID(s) and field 4 the target gene ID(s)
    (zero-based); a field may list several IDs separated by '/'.  Lines with
    fewer than 5 fields are skipped.

    Returns (duplicate, d), where d maps 'TARGET.TF' to
    [target, tf, sample_id] and duplicate is the number of pairs that were
    seen again after their first occurrence in this file.

    NOTE(review): fields are split naively on the separator; a quoted CSV
    field that contains the separator would be mis-split -- confirm the
    input has none.
    """
    sample_id = 'C0000000000001'
    id_validator = GeneIdValidator('ath')

    # The context manager closes the file even if reading fails.
    with codecs.open(fname, encoding='utf-8', errors='ignore') as f:
        lines = f.readlines()

    d = {}
    duplicate = 0
    for line in lines[1:]:
        # Remove only the line terminator.  A full strip() would also eat a
        # leading separator (e.g. a tab before an empty first column) and
        # shift every column one position to the left.
        lst = line.rstrip('\r\n').split(separator)
        if len(lst) < 5:
            continue
        tf_lst = _split_gene_ids(lst[1], id_validator)
        target_lst = _split_gene_ids(lst[4], id_validator)
        for tf in tf_lst:
            for target in target_lst:
                k = target + '.' + tf
                if k in d:
                    duplicate += 1
                else:
                    d[k] = [target, tf, sample_id]

    return duplicate, d


if __name__ == '__main__':
    # OSU AtRegNet, an older version.  Most are confirmed tf-target pairs???
    duplicate1, d = make_dictionary('../Data/information/AtRegNet.txt', '\t')
    # OSU AtRegNet downloaded in Feb 2021.  This file includes many more
    # tf-target pairs (confirmed, unconfirmed, dap-seq, etc).
    duplicate2, d2 = make_dictionary('../Data/information/AtRegNet2.csv', ',')
    d.update(d2)  # combine the two dictionaries into one.

    # Statistics go to stderr so that stdout holds only the pair table.
    # The duplicate count is per input file; pairs shared between the two
    # files are merged by update() without being counted.
    print('pairs: %d' % len(d), file=sys.stderr)
    print('duplicate: %d' % (duplicate1 + duplicate2), file=sys.stderr)
    for k in sorted(d):
        print('\t'.join(d[k]))