make_target_tf_agris.py: refactor code and include TF-target pairs from the latest OSU AtRegNet data.

author: Hui Lan <lanhui@zjnu.edu.cn> 2021-02-16 11:39:51 +0800
committer: Hui Lan <lanhui@zjnu.edu.cn> 2021-02-16 11:39:51 +0800
commit: 676b4e16e03f128d26c3c5142eef953319b1f23b (patch)
tree: 3347ec1b58bb1ff110cfc460be2dfe437d5d7ba7 /Code
parent: 9f0f2149a008362558cdb18953f734285af6fd38 (diff)
1 files changed, 56 insertions, 38 deletions
diff --git a/Code/make_target_tf_agris.py b/Code/make_target_tf_agris.py
index c96c8fa..8e2cb2e 100644
--- a/Code/make_target_tf_agris.py
+++ b/Code/make_target_tf_agris.py
@@ -1,39 +1,57 @@
-# Make target_tf from AtRegNet.txt
-# Usage: python make_target_tf_agris.py > ../Data/information/target_tf_agris.txt
-
-fname = '../Data/information/AtRegNet.txt'
-
-sample_id = 'C0000000000001'
-
-f = open(fname)
-lines = f.readlines()
-f.close()
-
-d = {}
-count = 2
-duplicate = 0
-for line in lines[1:]:
-    line = line.strip()
-    lst = line.split('\t')
-    if len(lst) >= 5:
-        tf0     = lst[1].upper().strip()
-        target0 = lst[4].upper().strip()
-        tf_lst = tf0.split('/')
-        target_lst = target0.split('/')
-        for tf in tf_lst:
-            for target in target_lst:
-                if tf.startswith('AT') and target.startswith('AT'):
-                    k = target + '.' + tf
-                    if k in d:
-                        #print('Warning at line %d  ' % (count))
-                        duplicate += 1
-                    else:
-                        d[k] = [target, tf, sample_id]
-    count += 1
-
-
-print('pairs %d' % len(d))
-print('duplicate %d' % (duplicate))
-for k in sorted(d.keys()):
-    print('\t'.join(d[k]))
+# Make target_tf from AtRegNet.txt and from AtRegNet2.csv (a newer, much bigger file).
+# Usage: python3 make_target_tf_agris.py > ../Data/information/target_tf_agris.txt
+# Last modified on 16 Feb 2021 by Hui Lan <lanhui@zjnu.edu.cn>
 
+import codecs, sys
+
+class GeneIdValidator:
+    def __init__(self, organism):
+        self.org = organism
+    def isvalid(self, ID):
+        if self.org == 'ath':
+            return ID.startswith('AT') and len(ID) == 9 and ID[3] == 'G'
+        else:
+            return False
+
+
+def make_dictionary(fname, separator):
+    sample_id = 'C0000000000001'
+
+    id_validator = GeneIdValidator('ath')
+
+    f = codecs.open(fname, encoding='utf-8', errors='ignore')
+    lines = f.readlines()
+    f.close()
+    
+    d = {}
+    duplicate = 0
+    for line in lines[1:]:
+        line = line.strip()
+        lst = line.split(separator)
+        if len(lst) >= 5:
+            tf0     = lst[1].upper().strip()
+            target0 = lst[4].upper().strip()
+            tf_lst = [x.strip() for x in tf0.split('/') if id_validator.isvalid(x)]
+            target_lst = [x.strip() for x in target0.split('/') if id_validator.isvalid(x)]
+            for tf in tf_lst:
+                for target in target_lst:
+                    if tf.startswith('AT') and target.startswith('AT'):
+                        k = target + '.' + tf
+                        if k in d:
+                            duplicate += 1
+                        else:
+                            d[k] = [target, tf, sample_id]
+
+    return duplicate, d
+
+
+if __name__ == '__main__':
+    duplicate1, d = make_dictionary('../Data/information/AtRegNet.txt', '\t')  # OSU AtRegNet, an older version.  Presumably mostly confirmed TF-target pairs (not verified).
+    duplicate2, d2 = make_dictionary('../Data/information/AtRegNet2.csv', ',') # OSU AtRegNet downloaded in Feb 2021.  This file includes many more TF-target pairs (confirmed, unconfirmed, DAP-seq, etc.).
+    d.update(d2) # combine two dictionaries into one.
+    
+    print('pairs: %d' % len(d), file=sys.stderr)
+    print('duplicate: %d' % (duplicate1 + duplicate2), file=sys.stderr)
+    for k in sorted(d.keys()):
+        print('\t'.join(d[k]))
+
author	Hui Lan <lanhui@zjnu.edu.cn>	2021-02-16 11:39:51 +0800
committer	Hui Lan <lanhui@zjnu.edu.cn>	2021-02-16 11:39:51 +0800
commit	676b4e16e03f128d26c3c5142eef953319b1f23b (patch)
tree	3347ec1b58bb1ff110cfc460be2dfe437d5d7ba7 /Code
parent	9f0f2149a008362558cdb18953f734285af6fd38 (diff)