Diffstat (limited to 'Code/make_upload_chip_parameter.py')
-rw-r--r--  Code/make_upload_chip_parameter.py  233
1 file changed, 233 insertions, 0 deletions
diff --git a/Code/make_upload_chip_parameter.py b/Code/make_upload_chip_parameter.py
new file mode 100644
index 0000000..e6cc4a8
--- /dev/null
+++ b/Code/make_upload_chip_parameter.py
@@ -0,0 +1,233 @@
+# Usage: python make_upload_chip_parameter.py
+#
+# Purpose: generate a section of parameter_for_buildCmatrix.txt from the uploaded files in UPLOAD_DIR.
+#          Each unique uploaded file is assigned an ID.
+#          An assigned ID starts with C0000, followed by 9 digits.
+#          The following case is handled: (i) the same BED file is uploaded several times; the latest submission is used.
+#
+# TBD: append to PARAMETER_FOR_BUILDCMATRIX
+# Created 20 July 2017, slcu, hui
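+#
+# An appended entry in parameter_for_buildCmatrix.txt has the shape produced by
+# make_string() below; the values shown here are illustrative only:
+#
+#   @C0000000000001
+#   PROTEIN_ID:
+#   PROTEIN_NAME:
+#   DATA_NAME:20170720120000.narrowPeak
+#   DATA_FORMAT:narrowPeak
+#   DESCRIPTION:user upload
+#   LOCATION:/abs/path/to/Data/upload/chipseq/20170720120000.narrowPeak
+#   NOTE:update:20170720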
+
+import os, sys, glob, shutil
+from datetime import datetime
+
+PARAMETER_FOR_BUILDCMATRIX = '../Data/upload/parameter_for_buildCmatrix.txt' # [change]
+UPLOAD_DIR = '../Data/upload/chipseq'
+
+INCLUDE_STAMP = 'BRAIN_HAS_INCLUDED_ME'
+
+def good_file(fname):
+    ''' Return True if the header of fname reports STATUS: SUCC or STATUS: UNKNOWN; False otherwise. '''
+    f = open(fname)
+    lines = f.readlines()
+    f.close()
+    for line in lines:
+        line = line.strip()
+        if line.startswith('#') and 'STATUS:' in line:
+            if 'SUCC' in line or 'UNKNOWN' in line:
+                return True
+            if 'FAIL' in line:
+                return False
+    return False
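+# Note: good_file() assumes each uploaded file carries a status header line,
+# for example '# STATUS: SUCC' (format illustrative); files marked FAIL, or
+# with no STATUS line at all, are skipped.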
+
+def already_included(fname):
+    ''' If fname has already been processed, its head line is marked with BRAIN_HAS_INCLUDED_ME. '''
+    f = open(fname)
+    lines = f.readlines()
+    n = len(lines)
+    f.close()
+    for line in lines[0:min(n, 5)]:  # the first five lines should include INCLUDE_STAMP if this file is already included
+        line = line.strip()
+        if line.startswith('#') and INCLUDE_STAMP in line:
+            return True
+    return False
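+# A processed file starts with a stamp line written by mark_it_as_included(),
+# e.g. '# BRAIN_HAS_INCLUDED_ME 2017-07-20 12:34' (timestamp illustrative).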
+
+
+def same_content(f1, f2):
+    ''' Test whether two files, f1 and f2, have the same content, ignoring lines starting with '#'. '''
+    if not (os.path.exists(f1) and os.path.exists(f2)):
+        return False
+    a = open(f1)
+    b = open(f2)
+    s1 = ''
+    for line in a:
+        line = line.strip()
+        if not line.startswith('#'):  # don't include lines starting with '#'
+            s1 += line
+    s2 = ''
+    for line in b:
+        line = line.strip()
+        if not line.startswith('#'):
+            s2 += line
+    a.close()
+    b.close()
+    return s1 == s2
+
+def repeat(fname, d):
+    ''' Does an entry in d have the same content as fname? Return '' if not; otherwise return the ID of the conflicting entry. '''
+    for k in d:
+        if same_content(fname, d[k]['LOCATION']):
+            return k
+    return ''
+
+def update_dict(d, k, fname):
+    d[k] = make_chip_info_dict(fname)
+
+# def update_it(upload_dir, upload_dict):
+#     id_lst = sorted(upload_dict.keys())
+#     if id_lst != []:
+#         last_id = id_lst[-1]
+#         last_id_number = int(last_id[5:])
+#     else:
+#         last_id_number = 0
+#     for fname in sorted(glob.glob(os.path.join(UPLOAD_DIR, '20*.*'))):  # all uploaded BED files start with time stamp 20.....
+#         if good_file(fname) and not already_included(fname):
+#             #print(upload_dict)
+#             k = repeat(fname, upload_dict)
+#             if k == '':
+#                 k = '%d' % (last_id_number + 1)
+#                 k = 'C0000' + k.zfill(9)
+#                 upload_dict[k] = make_chip_info_dict(fname)
+#             else:
+#                 update_dict(upload_dict, k, fname)
+#             mark_it_as_included(fname)
+
+
+def make_chip_info_dict(fname):
+    ''' Return a dictionary of metadata parsed from the header lines of a user-submitted file. '''
+    d = {'PROTEIN_ID':'', 'PROTEIN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'narrowPeak', 'DESCRIPTION':'user upload', 'LOCATION':'', 'NOTE':''}
+    f = open(fname)
+    lines = f.readlines()
+    f.close()
+    for line in lines:
+        line = line.strip()
+        if line.startswith('#') and ':' in line:
+            s = line.lstrip('#').strip()       # header text after the leading '#'
+            k = s[:s.find(':')].strip()
+            v = s[(s.find(':')+1):].strip()
+            d[k] = v
+
+    d['DATA_NAME'] = os.path.basename(fname)
+    d['LOCATION'] = os.path.abspath(fname)
+    d['NOTE'] = 'update:%s' % datetime.now().strftime('%Y%m%d')
+    return d
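+# The '# KEY: value' header lines above are whatever the uploader provided; an
+# uploaded file might, for example, begin with (values illustrative only):
+#   # PROTEIN_ID: <protein id>
+#   # PROTEIN_NAME: <protein name>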
+
+def mark_it_as_included(fname):
+    ''' Prepend a head line containing INCLUDE_STAMP and the current time. '''
+    f = open(fname)
+    s = f.read()
+    f.close()
+    f = open(fname, 'w')
+    curr_time = datetime.now().strftime('%Y-%m-%d %H:%M')
+    f.write('# %s %s\n' % (INCLUDE_STAMP, curr_time) + s)
+    f.close()
+
+def make_string(d):
+    s = ''
+    for k in sorted(d.keys()):
+        s += '@%s\n' % k
+        s += 'PROTEIN_ID:%s\n' % d[k]['PROTEIN_ID']
+        s += 'PROTEIN_NAME:%s\n' % d[k]['PROTEIN_NAME']
+        s += 'DATA_NAME:%s\n' % d[k]['DATA_NAME']
+        s += 'DATA_FORMAT:narrowPeak\n'
+        s += 'DESCRIPTION:%s\n' % d[k]['DESCRIPTION']
+        s += 'LOCATION:%s\n' % d[k]['LOCATION']
+        s += 'NOTE:%s\n\n' % d[k]['NOTE']
+    return s
+
+def md(fname):
+    ''' Return a dictionary containing the parameter information. '''
+    d = {}
+    if not os.path.exists(fname):
+        return {}
+    f = open(fname)
+    lines = f.readlines()
+    f.close()
+    for line in lines:
+        line = line.strip()
+        if line != '' and line.startswith('@'):
+            k = line[1:]
+            d[k] = {}
+        elif line != '':
+            lst = line.split(':')
+            k2 = lst[0].strip()
+            v = line[(line.find(':')+1):]
+            d[k][k2] = v
+    return d
+
+def is_empty(fname):
+    ''' Return True if fname exists but has no content. '''
+    if os.path.exists(fname):
+        f = open(fname)
+        s = f.read()
+        f.close()
+        return s.strip() == ''
+    return False
+
+def get_largest_upload_chip_id(fname):
+    ''' Return the largest numeric part of the @C0000... IDs in fname (0 if none, or if fname does not exist). '''
+    if not os.path.exists(fname):
+        return 0
+    lst = []
+    f = open(fname)
+    for line in f:
+        line = line.strip()
+        if line.startswith('@C0000'):
+            lst.append(int(line[2:]))
+    f.close()
+    if lst != []:
+        return max(lst)
+    else:
+        return 0
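+# For example, a line '@C0000000000012' in the parameter file yields 12, so the
+# next assigned ID would be C0000000000013.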
+
+def make_upload_dict(param_fname, included_path):
+    d = {}
+    i = get_largest_upload_chip_id(param_fname) + 1  # starting id
+    for fn in sorted(glob.glob(os.path.join(UPLOAD_DIR, '20*.*')), reverse=True):  # newer files are considered first, so the latest submission wins
+        k = 'C0000' + ('%d' % (i)).zfill(9)
+        if good_file(fn) and not already_included(fn) and repeat(fn, d) == '':
+            d[k] = make_chip_info_dict(fn)
+            i += 1
+        if good_file(fn):
+            mark_it_as_included(fn)
+            shutil.move(fn, included_path)  # move the processed file out of the upload directory
+
+    return d
+
+def append_to_file(fname, s):
+    f = open(fname, 'a')
+    f.write('\n' + s + '\n')
+    f.close()
+
+def make_directory(my_dir):
+    if not os.path.exists(my_dir):
+        os.makedirs(my_dir)
+
+def make_copy(fname):
+    ''' Save a timestamped copy of fname before it is modified. '''
+    if os.path.exists(fname):
+        curr_time = datetime.now().strftime('%Y%m%d_%H%M%S')
+        new_fname = fname + '.copy.%s' % (curr_time)
+        shutil.copyfile(fname, new_fname)
+
+## main
+included_path = os.path.join(UPLOAD_DIR, 'included')
+make_directory(included_path)
+upload_dict = make_upload_dict(PARAMETER_FOR_BUILDCMATRIX, included_path)
+s = make_string(upload_dict)
+if s != '':
+    # before changing PARAMETER_FOR_BUILDCMATRIX, make a copy of it
+    make_copy(PARAMETER_FOR_BUILDCMATRIX)
+    append_to_file(PARAMETER_FOR_BUILDCMATRIX, s)