summaryrefslogtreecommitdiff
path: root/Code
diff options
context:
space:
mode:
authorHui Lan <lanhui@zjnu.edu.cn>2021-02-16 11:39:51 +0800
committerHui Lan <lanhui@zjnu.edu.cn>2021-02-16 11:39:51 +0800
commit676b4e16e03f128d26c3c5142eef953319b1f23b (patch)
tree3347ec1b58bb1ff110cfc460be2dfe437d5d7ba7 /Code
parent9f0f2149a008362558cdb18953f734285af6fd38 (diff)
make_target_tf_agris.py: refactor code, and include tf-target pairs from the latest osu AtRegNet data.
Diffstat (limited to 'Code')
-rw-r--r--Code/make_target_tf_agris.py94
1 files changed, 56 insertions, 38 deletions
diff --git a/Code/make_target_tf_agris.py b/Code/make_target_tf_agris.py
index c96c8fa..8e2cb2e 100644
--- a/Code/make_target_tf_agris.py
+++ b/Code/make_target_tf_agris.py
@@ -1,39 +1,57 @@
-# Make target_tf from AtRegNet.txt
-# Usage: python make_target_tf_agris.py > ../Data/information/target_tf_agris.txt
-
-fname = '../Data/information/AtRegNet.txt'
-
-sample_id = 'C0000000000001'
-
-f = open(fname)
-lines = f.readlines()
-f.close()
-
-d = {}
-count = 2
-duplicate = 0
-for line in lines[1:]:
- line = line.strip()
- lst = line.split('\t')
- if len(lst) >= 5:
- tf0 = lst[1].upper().strip()
- target0 = lst[4].upper().strip()
- tf_lst = tf0.split('/')
- target_lst = target0.split('/')
- for tf in tf_lst:
- for target in target_lst:
- if tf.startswith('AT') and target.startswith('AT'):
- k = target + '.' + tf
- if k in d:
- #print('Warning at line %d ' % (count))
- duplicate += 1
- else:
- d[k] = [target, tf, sample_id]
- count += 1
-
-
-print('pairs %d' % len(d))
-print('duplicate %d' % (duplicate))
-for k in sorted(d.keys()):
- print('\t'.join(d[k]))
+# Make target_tf from AtRegNet.txt and from AtRegNet2.csv (much bigger).
+# Usage: python3 make_target_tf_agris.py > ../Data/information/target_tf_agris.txt
+# Last modified on 16 Feb 2021 by Hui Lan <lanhui@zjnu.edu.cn>
+import codecs, sys
+
class GeneIdValidator:
    """Check whether a string is a well-formed gene locus ID for an organism.

    Only the organism code 'ath' is supported; for any other code every ID
    is reported as invalid.
    """

    # An 'ath' locus ID has the shape AT<chromosome>G<5 digits>, for example
    # AT1G01010.  Chromosome codes: 1-5 (nuclear), C (chloroplast) and
    # M (mitochondrion).
    _ATH_CHROMOSOMES = frozenset('12345CM')
    _DIGITS = frozenset('0123456789')

    def __init__(self, organism):
        """Store the organism code that selects the validation rule."""
        self.org = organism

    def isvalid(self, ID):
        """Return True if ID is a well-formed locus ID for this organism.

        The comparison is case-sensitive, so callers are expected to
        upper-case the ID first.  Non-string input is reported as invalid
        instead of raising.
        """
        if self.org != 'ath':
            return False
        if not isinstance(ID, str) or len(ID) != 9:
            return False
        # Checking only the prefix and the 'G' would let strings such as
        # 'AT1G0101X' through, so the chromosome code and the numeric
        # suffix are verified as well.
        return (
            ID.startswith('AT')
            and ID[2] in self._ATH_CHROMOSOMES
            and ID[3] == 'G'
            and all(c in self._DIGITS for c in ID[4:])
        )
+
+
def make_dictionary(fname, separator):
    """Collect unique (target, TF) locus-ID pairs from an AtRegNet table.

    The first line of the file is treated as a header and skipped.  For
    every remaining row with at least five fields, field index 1 is read as
    the TF ID(s) and field index 4 as the target ID(s).  One field may hold
    several IDs separated by '/'.  IDs are upper-cased, stripped of
    surrounding whitespace, and kept only if GeneIdValidator('ath') accepts
    them.

    Args:
        fname: path of the table; decoded as UTF-8 with undecodable bytes
            ignored.
        separator: field separator of the table (a tab or a comma).

    Returns:
        A tuple (duplicate, d).  duplicate counts how often an already
        recorded pair was met again in this file.  d maps the key
        'TARGET.TF' to the list [target, tf, sample_id].
    """
    sample_id = 'C0000000000001'  # constant sample ID stored with every pair
    id_validator = GeneIdValidator('ath')

    def valid_ids(field):
        # Strip BEFORE validating: the validator checks the exact length, so
        # an ID padded with blanks (e.g. 'AT1G01010 / AT2G01020') would
        # otherwise be rejected and silently lost.
        candidates = (x.strip() for x in field.upper().split('/'))
        return [x for x in candidates if id_validator.isvalid(x)]

    # 'with' guarantees the handle is closed even if reading fails.
    with codecs.open(fname, encoding='utf-8', errors='ignore') as f:
        lines = f.readlines()

    d = {}
    duplicate = 0
    for line in lines[1:]:
        # Remove only the line terminator.  A full strip() would also eat
        # leading separators (tabs) and shift the columns whenever the first
        # field of a row is empty.
        # NOTE(review): fields are split naively; quoted CSV fields that
        # contain the separator are not handled.
        lst = line.rstrip('\r\n').split(separator)
        if len(lst) < 5:
            continue
        tf_lst = valid_ids(lst[1])
        target_lst = valid_ids(lst[4])
        for tf in tf_lst:
            for target in target_lst:
                k = target + '.' + tf
                if k in d:
                    duplicate += 1
                else:
                    d[k] = [target, tf, sample_id]

    return duplicate, d
+
+
if __name__ == '__main__':
    # Input tables, read in this order:
    #   1. AtRegNet.txt  - an older OSU AtRegNet release (tab-separated);
    #      presumably mostly confirmed TF-target pairs (unverified).
    #   2. AtRegNet2.csv - OSU AtRegNet downloaded in Feb 2021; contains many
    #      more TF-target pairs (confirmed, unconfirmed, DAP-seq, etc).
    sources = [
        ('../Data/information/AtRegNet.txt', '\t'),
        ('../Data/information/AtRegNet2.csv', ','),
    ]

    pairs = {}
    duplicate_total = 0
    for path, sep in sources:
        n_duplicate, found = make_dictionary(path, sep)
        duplicate_total += n_duplicate
        pairs.update(found)  # merge this table's pairs into the combined set

    # Statistics go to stderr so that stdout holds only the pair table.
    sys.stderr.write('pairs: %d\n' % len(pairs))
    sys.stderr.write('duplicate: %d\n' % duplicate_total)
    for key in sorted(pairs):
        print('\t'.join(pairs[key]))
+