From 97fdefab064f63642fa3ece05b807d29b459df31 Mon Sep 17 00:00:00 2001
From: Hui Lan <lanhui@zjnu.edu.cn>
Date: Wed, 4 Dec 2019 19:03:19 +0800
Subject: brain: add python and R code to local repository.

---
 Code/make_parameter_dapseq3.py | 75 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)
 create mode 100644 Code/make_parameter_dapseq3.py

(limited to 'Code/make_parameter_dapseq3.py')

diff --git a/Code/make_parameter_dapseq3.py b/Code/make_parameter_dapseq3.py
new file mode 100644
index 0000000..405b3dc
--- /dev/null
+++ b/Code/make_parameter_dapseq3.py
@@ -0,0 +1,75 @@
+# Usage: python make_parameter_dapseq3.py
+# Because dap-seq dose not include all TFs, so include other TFs. The idea is that TFs within a same family are very conservative in binding.
+# 
+
+import sys, glob, os, operator
+from geneid2name import make_gene_name_AGI_map_dict
+
+DAPSEQ_DIR = '/home/hui/network/dapseq_merged'
+MAP_FILE = '/home/hui/network/dapseq_merged/tffamily.simple.txt'
+    
+def get_name(s):
+    lst = s.split('_')
+    result = []
+    for x in lst:
+        if x != 'tnt':
+            result.append(x)
+    return '_'.join(result)
+
+def make_dapseq_dictionary(dirname):
+
+    d = {}
+
+    files = glob.glob(os.path.join(dirname, '*.narrowPeak'))
+
+    for f in files:
+        lst = f.split('/')
+        tf_name = lst[-1].split('.')[0]
+        tf_name = get_name(tf_name)
+        if not tf_name in d:
+            d[tf_name] = f
+        else:
+            print('ERROR: transcription factor name not unique.')
+            sys.exit()
+
+    return d
+
+
+d = make_dapseq_dictionary(DAPSEQ_DIR)
+
+f = open(MAP_FILE)
+lines = f.readlines()
+f.close()
+
+# since MAP_FILE contain duplicate lines
+d_family = {}
+for line in lines:
+    line = line.strip()
+    lst = line.split()
+    tf = lst[0].upper()
+    tf_name = lst[1]
+    family = lst[2]
+    if not tf in d_family:
+        d_family[tf] = (tf_name, family)
+    else:
+        if family != d_family[tf][1]:
+            print('WARNING: %s conflict [%s %s]!' % (tf, family, d_family[tf][1]))
+            
+count = 1
+for k in sorted(d_family.keys()):
+    g = k
+    gname = d_family[k][0]
+    key = d_family[k][1]
+    if key in d:
+        cid = 'C0003%09d' % (count)
+        count += 1
+        print('@%s' % (cid))
+        print('PROTEIN_ID:%s' % (g))
+        print('PROTEIN_NAME:%s' % (gname))
+        print('DATA_NAME:%s' % (gname))
+        print('DATA_FORMAT:%s' % ('narrowPeak'))
+        print('DESCRIPTION:inferred from dapseq')
+        #print('LOCATION:%s' % (os.path.join(DAPSEQ_DIR, d[key])))
+        print('LOCATION:%s' % (d[key]))        
+        print('NOTE:')
+        print('')
-- 
cgit v1.2.1