path: root/Code
diff options
author Hui Lan <> 2021-02-16 11:39:51 +0800
committer Hui Lan <> 2021-02-16 11:39:51 +0800
commit 676b4e16e03f128d26c3c5142eef953319b1f23b (patch)
tree 3347ec1b58bb1ff110cfc460be2dfe437d5d7ba7 /Code
parent 9f0f2149a008362558cdb18953f734285af6fd38 (diff)
refactor code, and include tf-target pairs from the latest osu AtRegNet data.
Diffstat (limited to 'Code')
1 files changed, 56 insertions, 38 deletions
diff --git a/Code/ b/Code/
index c96c8fa..8e2cb2e 100644
--- a/Code/
+++ b/Code/
@@ -1,39 +1,57 @@
-# Make target_tf from AtRegNet.txt
-# Usage: python > ../Data/information/target_tf_agris.txt
-fname = '../Data/information/AtRegNet.txt'
-sample_id = 'C0000000000001'
-f = open(fname)
-lines = f.readlines()
-d = {}
-count = 2
-duplicate = 0
-for line in lines[1:]:
- line = line.strip()
- lst = line.split('\t')
- if len(lst) >= 5:
- tf0 = lst[1].upper().strip()
- target0 = lst[4].upper().strip()
- tf_lst = tf0.split('/')
- target_lst = target0.split('/')
- for tf in tf_lst:
- for target in target_lst:
- if tf.startswith('AT') and target.startswith('AT'):
- k = target + '.' + tf
- if k in d:
- #print('Warning at line %d ' % (count))
- duplicate += 1
- else:
- d[k] = [target, tf, sample_id]
- count += 1
-print('pairs %d' % len(d))
-print('duplicate %d' % (duplicate))
-for k in sorted(d.keys()):
- print('\t'.join(d[k]))
+# Make target_tf from AtRegNet.txt and from AtRegNet.csv (much bigger).
+# Usage: python3 > ../Data/information/target_tf_agris.txt
+# Last modified on 16 Feb 2021 by Hui Lan <>
+import codecs, sys
+class GeneIdValidator:
+    """Decide whether a string has the shape of a gene ID for one organism.
+
+    Only the organism code 'ath' (presumably Arabidopsis thaliana) has a rule;
+    for any other code every ID is rejected.
+    """
+
+    def __init__(self, organism):
+        """Store the organism code that selects the validation rule."""
+        self.organism = organism
+
+    def isvalid(self, ID):
+        """Return True if ID matches the rule for this validator's organism.
+
+        For 'ath' an ID must start with 'AT', be exactly nine characters long
+        and have 'G' as its fourth character (for example 'AT1G01010').
+        The check is case-sensitive and does not strip whitespace, so callers
+        should normalise the ID first.
+        """
+        if self.organism == 'ath':
+            return ID.startswith('AT') and len(ID) == 9 and ID[3] == 'G'
+        # No rule is defined for other organisms, so nothing validates.
+        return False
+def make_dictionary(fname, separator):
+    """Collect unique target/TF pairs from a delimited AtRegNet-style file.
+
+    The first line is skipped as a header. On each later line that has at
+    least five fields, the second field is read as the TF and the fifth as
+    the target. Either field may hold several IDs joined by '/'; every
+    valid TF is paired with every valid target on that line.
+
+    Args:
+        fname: path of the input file (decoded as UTF-8, bad bytes dropped).
+        separator: field delimiter, e.g. a tab or a comma.
+
+    Returns:
+        A tuple (duplicate, d). duplicate counts pairs seen more than once
+        in this file; d maps 'TARGET.TF' to [target, tf, sample_id].
+    """
+    sample_id = 'C0000000000001'
+    id_validator = GeneIdValidator('ath')
+    # The context manager closes the file even if reading fails part-way.
+    with codecs.open(fname, encoding='utf-8', errors='ignore') as f:
+        lines = f.readlines()
+    d = {}
+    duplicate = 0
+    for line in lines[1:]:
+        # NOTE: plain split; quoted fields that contain the separator are
+        # not handled specially.
+        lst = line.strip().split(separator)
+        if len(lst) < 5:
+            continue
+        # Strip each ID *before* validating it: IDs written as 'A / B' carry
+        # padding that would otherwise fail the exact-length check.
+        tf_parts = [x.strip() for x in lst[1].upper().split('/')]
+        target_parts = [x.strip() for x in lst[4].upper().split('/')]
+        tf_lst = [x for x in tf_parts if id_validator.isvalid(x)]
+        target_lst = [x for x in target_parts if id_validator.isvalid(x)]
+        for tf in tf_lst:
+            for target in target_lst:
+                k = target + '.' + tf
+                if k in d:
+                    duplicate += 1
+                else:
+                    d[k] = [target, tf, sample_id]
+    return duplicate, d
+if __name__ == '__main__':
+    # Older, tab-separated OSU AtRegNet release. Whether most of its pairs
+    # are confirmed is an open question left by the original author.
+    dup_old, pairs = make_dictionary('../Data/information/AtRegNet.txt', '\t')
+    # OSU AtRegNet downloaded in Feb 2021 (comma-separated). It holds many
+    # more tf-target pairs: confirmed, unconfirmed, dap-seq, etc.
+    dup_new, pairs_new = make_dictionary('../Data/information/AtRegNet2.csv', ',')
+    # Merge both sources; on a shared key the newer file's entry wins.
+    merged = {**pairs, **pairs_new}
+    # Statistics go to stderr so that stdout carries only the pair table.
+    total_duplicates = dup_old + dup_new
+    print('pairs: %d' % len(merged), file=sys.stderr)
+    print('duplicate: %d' % total_duplicates, file=sys.stderr)
+    for key in sorted(merged):
+        print('\t'.join(merged[key]))