Diffstat (limited to 'Code/make_upload_chip_parameter.py')
-rw-r--r--  Code/make_upload_chip_parameter.py  233
1 file changed, 233 insertions, 0 deletions
diff --git a/Code/make_upload_chip_parameter.py b/Code/make_upload_chip_parameter.py
new file mode 100644
index 0000000..e6cc4a8
--- /dev/null
+++ b/Code/make_upload_chip_parameter.py
@@ -0,0 +1,233 @@
+# Usage: python make_upload_chip_parameter.py
+#
+# Purpose: generate a section of parameter_for_buildCmatrix.txt from the uploaded files in UPLOAD_DIR.
+#          Each unique uploaded file is assigned an ID.
+#          An assigned ID starts with C0000, followed by 9 digits.
+#          The following case is handled: (i) the same BED file is uploaded several times; the latest submission is used.
+#
+# TBD: append to PARAMETER_FOR_BUILDCMATRIX
+# Created 20 July 2017, slcu, hui
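+#
+# An appended entry in parameter_for_buildCmatrix.txt has the shape produced by
+# make_string() below; the values shown here are illustrative only:
+#
+#   @C0000000000001
+#   PROTEIN_ID:
+#   PROTEIN_NAME:
+#   DATA_NAME:20170720120000.narrowPeak
+#   DATA_FORMAT:narrowPeak
+#   DESCRIPTION:user upload
+#   LOCATION:/abs/path/to/Data/upload/chipseq/20170720120000.narrowPeak
+#   NOTE:update:20170720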
+
+import os, sys, glob, shutil
+from datetime import datetime
+
+PARAMETER_FOR_BUILDCMATRIX = '../Data/upload/parameter_for_buildCmatrix.txt' # [change]
+UPLOAD_DIR = '../Data/upload/chipseq'
+
+INCLUDE_STAMP = 'BRAIN_HAS_INCLUDED_ME'
+
+def good_file(fname):
+    ''' Return True if the header of fname reports STATUS: SUCC or STATUS: UNKNOWN; False otherwise. '''
+    f = open(fname)
+    lines = f.readlines()
+    f.close()
+    for line in lines:
+        line = line.strip()
+        if line.startswith('#') and 'STATUS:' in line:
+            if 'SUCC' in line or 'UNKNOWN' in line:
+                return True
+            if 'FAIL' in line:
+                return False
+    return False
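+# Note: good_file() assumes each uploaded file carries a status header line,
+# for example '# STATUS: SUCC' (format illustrative); files marked FAIL, or
+# with no STATUS line at all, are skipped.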
+
+def already_included(fname):
+    ''' If fname has already been processed, its head line is marked with BRAIN_HAS_INCLUDED_ME. '''
+    f = open(fname)
+    lines = f.readlines()
+    n = len(lines)
+    f.close()
+    for line in lines[0:min(n, 5)]:  # the first five lines should include INCLUDE_STAMP if this file is already included
+        line = line.strip()
+        if line.startswith('#') and INCLUDE_STAMP in line:
+            return True
+    return False
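+# A processed file starts with a stamp line written by mark_it_as_included(),
+# e.g. '# BRAIN_HAS_INCLUDED_ME 2017-07-20 12:34' (timestamp illustrative).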
+
+
+def same_content(f1, f2):
+    ''' Test whether two files, f1 and f2, have the same content, ignoring lines starting with '#'. '''
+    if not (os.path.exists(f1) and os.path.exists(f2)):
+        return False
+    a = open(f1)
+    b = open(f2)
+    s1 = ''
+    for line in a:
+        line = line.strip()
+        if not line.startswith('#'):  # don't include lines starting with '#'
+            s1 += line
+    s2 = ''
+    for line in b:
+        line = line.strip()
+        if not line.startswith('#'):
+            s2 += line
+    a.close()
+    b.close()
+    return s1 == s2
+
+def repeat(fname, d):
+    ''' Does an entry in d have the same content as fname? Return '' if not; otherwise return the ID of the conflicting entry. '''
+    for k in d:
+        if same_content(fname, d[k]['LOCATION']):
+            return k
+    return ''
+
+def update_dict(d, k, fname):
+    d[k] = make_chip_info_dict(fname)
+
+# def update_it(upload_dir, upload_dict):
+#     id_lst = sorted(upload_dict.keys())
+#     if id_lst != []:
+#         last_id = id_lst[-1]
+#         last_id_number = int(last_id[5:])
+#     else:
+#         last_id_number = 0
+#     for fname in sorted(glob.glob(os.path.join(UPLOAD_DIR, '20*.*'))):  # all uploaded BED files start with time stamp 20.....
+#         if good_file(fname) and not already_included(fname):
+#             #print(upload_dict)
+#             k = repeat(fname, upload_dict)
+#             if k == '':
+#                 k = '%d' % (last_id_number + 1)
+#                 k = 'C0000' + k.zfill(9)
+#                 upload_dict[k] = make_chip_info_dict(fname)
+#             else:
+#                 update_dict(upload_dict, k, fname)
+#             mark_it_as_included(fname)
+
+
+def make_chip_info_dict(fname):
+    ''' Return a dictionary of metadata parsed from the header lines of a user-submitted file. '''
+    d = {'PROTEIN_ID':'', 'PROTEIN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'narrowPeak', 'DESCRIPTION':'user upload', 'LOCATION':'', 'NOTE':''}
+    f = open(fname)
+    lines = f.readlines()
+    f.close()
+    for line in lines:
+        line = line.strip()
+        if line.startswith('#') and ':' in line:
+            s = line.lstrip('#').strip()       # header text after the leading '#'
+            k = s[:s.find(':')].strip()
+            v = s[(s.find(':')+1):].strip()
+            d[k] = v
+
+    d['DATA_NAME'] = os.path.basename(fname)
+    d['LOCATION'] = os.path.abspath(fname)
+    d['NOTE'] = 'update:%s' % datetime.now().strftime('%Y%m%d')
+    return d
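+# The '# KEY: value' header lines above are whatever the uploader provided; an
+# uploaded file might, for example, begin with (values illustrative only):
+#   # PROTEIN_ID: <protein id>
+#   # PROTEIN_NAME: <protein name>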
+
+def mark_it_as_included(fname):
+    ''' Prepend a head line containing INCLUDE_STAMP and the current time. '''
+    f = open(fname)
+    s = f.read()
+    f.close()
+    f = open(fname, 'w')
+    curr_time = datetime.now().strftime('%Y-%m-%d %H:%M')
+    f.write('# %s %s\n' % (INCLUDE_STAMP, curr_time) + s)
+    f.close()
+
+def make_string(d):
+    s = ''
+    for k in sorted(d.keys()):
+        s += '@%s\n' % k
+        s += 'PROTEIN_ID:%s\n' % d[k]['PROTEIN_ID']
+        s += 'PROTEIN_NAME:%s\n' % d[k]['PROTEIN_NAME']
+        s += 'DATA_NAME:%s\n' % d[k]['DATA_NAME']
+        s += 'DATA_FORMAT:narrowPeak\n'
+        s += 'DESCRIPTION:%s\n' % d[k]['DESCRIPTION']
+        s += 'LOCATION:%s\n' % d[k]['LOCATION']
+        s += 'NOTE:%s\n\n' % d[k]['NOTE']
+    return s
+
+def md(fname):
+    ''' Return a dictionary containing the parameter information. '''
+    d = {}
+    if not os.path.exists(fname):
+        return {}
+    f = open(fname)
+    lines = f.readlines()
+    f.close()
+    for line in lines:
+        line = line.strip()
+        if line != '' and line.startswith('@'):
+            k = line[1:]
+            d[k] = {}
+        elif line != '':
+            lst = line.split(':')
+            k2 = lst[0].strip()
+            v = line[(line.find(':')+1):]
+            d[k][k2] = v
+    return d
+
+def is_empty(fname):
+    ''' Return True if fname exists but has no content. '''
+    if os.path.exists(fname):
+        f = open(fname)
+        s = f.read()
+        f.close()
+        return s.strip() == ''
+    return False
+
+def get_largest_upload_chip_id(fname):
+    ''' Return the largest numeric part of the @C0000... IDs in fname (0 if none, or if fname does not exist). '''
+    if not os.path.exists(fname):
+        return 0
+    lst = []
+    f = open(fname)
+    for line in f:
+        line = line.strip()
+        if line.startswith('@C0000'):
+            lst.append(int(line[2:]))
+    f.close()
+    if lst != []:
+        return max(lst)
+    else:
+        return 0
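+# For example, a line '@C0000000000012' in the parameter file yields 12, so the
+# next assigned ID would be C0000000000013.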
+
+def make_upload_dict(param_fname, included_path):
+    d = {}
+    i = get_largest_upload_chip_id(param_fname) + 1  # starting id
+    for fn in sorted(glob.glob(os.path.join(UPLOAD_DIR, '20*.*')), reverse=True):  # newer files are considered first, so the latest submission wins
+        k = 'C0000' + ('%d' % (i)).zfill(9)
+        if good_file(fn) and not already_included(fn) and repeat(fn, d) == '':
+            d[k] = make_chip_info_dict(fn)
+            i += 1
+        if good_file(fn):
+            mark_it_as_included(fn)
+            shutil.move(fn, included_path)  # move the processed file out of the upload directory
+
+    return d
+
+def append_to_file(fname, s):
+    f = open(fname, 'a')
+    f.write('\n' + s + '\n')
+    f.close()
+
+def make_directory(my_dir):
+    if not os.path.exists(my_dir):
+        os.makedirs(my_dir)
+
+def make_copy(fname):
+    ''' Save a timestamped copy of fname before it is modified. '''
+    if os.path.exists(fname):
+        curr_time = datetime.now().strftime('%Y%m%d_%H%M%S')
+        new_fname = fname + '.copy.%s' % (curr_time)
+        shutil.copyfile(fname, new_fname)
+
+## main
+included_path = os.path.join(UPLOAD_DIR, 'included')
+make_directory(included_path)
+upload_dict = make_upload_dict(PARAMETER_FOR_BUILDCMATRIX, included_path)
+s = make_string(upload_dict)
+if s != '':
+    # before changing PARAMETER_FOR_BUILDCMATRIX, make a copy of it
+    make_copy(PARAMETER_FOR_BUILDCMATRIX)
+    append_to_file(PARAMETER_FOR_BUILDCMATRIX, s)