author     Hui Lan <lanhui@zjnu.edu.cn>    2019-12-04 19:03:19 +0800
committer  Hui Lan <lanhui@zjnu.edu.cn>    2019-12-04 19:03:19 +0800
commit     97fdefab064f63642fa3ece05b807d29b459df31 (patch)
tree       a058530023224f3e35b1783996f3530c80c04bc5 /Code/make_upload_chip_parameter.py
brain: add python and R code to local repository.
Diffstat (limited to 'Code/make_upload_chip_parameter.py')
-rw-r--r--  Code/make_upload_chip_parameter.py  233
1 file changed, 233 insertions, 0 deletions
diff --git a/Code/make_upload_chip_parameter.py b/Code/make_upload_chip_parameter.py
new file mode 100644
index 0000000..e6cc4a8
--- /dev/null
+++ b/Code/make_upload_chip_parameter.py
@@ -0,0 +1,233 @@
# Usage: python make_upload_chip_parameter.py
#
# Purpose: make a part of parameter_for_buildCmatrix.txt given the uploaded files in UPLOAD_DIR.
# Each unique uploaded file will be assigned an ID.
# The assigned ID starts with C0000, followed by 9 digits.
# The following cases are handled: (i) the same BED file uploaded several times; the latest submission will be used.
#
# TBD: append to PARAMETER_FOR_BUILDCMATRIX
# Created 20 July 2017, slcu, hui

import os, sys, glob
from datetime import datetime

PARAMETER_FOR_BUILDCMATRIX = '../Data/upload/parameter_for_buildCmatrix.txt' # [change]
UPLOAD_DIR = '../Data/upload/chipseq'

INCLUDE_STAMP = 'BRAIN_HAS_INCLUDED_ME'

def good_file(fname):
    ''' Return True if the file's '# ... STATUS:' header line reports SUCC or UNKNOWN, False otherwise. '''
    f = open(fname)
    lines = f.readlines()
    f.close()
    for line in lines:
        line = line.strip()
        if line.startswith('#') and 'STATUS:' in line:
            if 'SUCC' in line or 'UNKNOWN' in line:
                return True
            if 'FAIL' in line:
                return False
    return False

def already_included(fname):
    ''' If fname has already been processed, its head line is marked with BRAIN_HAS_INCLUDED_ME. '''
    f = open(fname)
    lines = f.readlines()
    n = len(lines)
    f.close()
    for line in lines[0:min(n, 5)]:  # the first five lines should include INCLUDE_STAMP if this file is already included
        line = line.strip()
        if line.startswith('#') and INCLUDE_STAMP in line:
            return True
    return False


def same_content(f1, f2):
    ''' Test whether two files, f1 and f2, have the same content. '''
    if os.path.exists(f1) and not os.path.exists(f2):
        return False
    if not os.path.exists(f1) and os.path.exists(f2):
        return False
    if not os.path.exists(f1) and not os.path.exists(f2):
        return False
    if os.path.exists(f1) and os.path.exists(f2):
        a = open(f1)
        b = open(f2)
        s1 = ''
        for line in a:
            line = line.strip()
            if not line.startswith('#'):  # don't include lines starting with '#'
                s1 += line
        s2 = ''
        for line in b:
            line = line.strip()
            if not line.startswith('#'):
                s2 += line
        a.close()
        b.close()
        if s1 == s2:
            return True
        else:
            return False

def repeat(fname, d):
    ''' Are there other files having the same content as fname? Return '' if not; otherwise return the ID of the conflicting entry. '''
    for k in d:
        if same_content(fname, d[k]['LOCATION']):
            return k
    return ''

def update_dict(d, k, fname):
    d[k] = make_chip_info_dict(fname)

# def update_it(upload_dir, upload_dict):
#     id_lst = sorted(upload_dict.keys())
#     if id_lst != []:
#         last_id = id_lst[-1]
#         last_id_number = int(last_id[5:])
#     else:
#         last_id_number = 0
#     for fname in sorted(glob.glob(os.path.join(UPLOAD_DIR, '20*.*'))):  # all uploaded BED files start with time stamp 20.....
#         if good_file(fname) and not already_included(fname):
#             #print(upload_dict)
#             k = repeat(fname, upload_dict)
#             if k == '':
#                 k = '%d' % (last_id_number + 1)
#                 k = 'C0000' + k.zfill(9)
#                 upload_dict[k] = make_chip_info_dict(fname)
#             else:
#                 update_dict(upload_dict, k, fname)
#             mark_it_as_included(fname)


def make_chip_info_dict(fname):
    ''' Return a dictionary describing a user-submitted file.
    '''
    d = {'PROTEIN_ID':'', 'PROTEIN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'narrowPeak', 'DESCRIPTION':'user upload', 'LOCATION':'', 'NOTE':''}
    f = open(fname)
    lines = f.readlines()
    f.close()
    for line in lines:
        line = line.strip()
        if line.startswith('#'):  # header lines look like '# KEY:VALUE'
            s = line[(line.rfind('#')+1):]
            s = s.strip()
            lst = s.split(':')
            k = lst[0].strip()
            v = line[(line.find(':')+1):]
            d[k] = v

    d['DATA_NAME'] = os.path.basename(fname)
    d['LOCATION'] = os.path.abspath(fname)
    d['NOTE'] = 'update:%s' % datetime.now().strftime('%Y%m%d')
    return d

def mark_it_as_included(fname):
    ''' Prepend a head line containing INCLUDE_STAMP. '''
    f = open(fname)
    s = f.read()
    f.close()
    f = open(fname, 'w')
    curr_time = datetime.now().strftime('%Y-%m-%d %H:%M')
    f.write('# %s %s\n' % (INCLUDE_STAMP, curr_time) + s)
    f.close()

def make_string(d):
    ''' Format each entry of d as an @ID block for parameter_for_buildCmatrix.txt. '''
    s = ''
    for k in sorted(d.keys()):
        s += '@%s\n' % k
        s += 'PROTEIN_ID:%s\n' % d[k]['PROTEIN_ID']
        s += 'PROTEIN_NAME:%s\n' % d[k]['PROTEIN_NAME']
        s += 'DATA_NAME:%s\n' % d[k]['DATA_NAME']
        s += 'DATA_FORMAT:narrowPeak\n'
        s += 'DESCRIPTION:%s\n' % d[k]['DESCRIPTION']
        s += 'LOCATION:%s\n' % d[k]['LOCATION']
        s += 'NOTE:%s\n\n' % d[k]['NOTE']
    return s

def md(fname):
    ''' Return a dictionary containing the parameter information. '''
    d = {}
    if not os.path.exists(fname):
        return {}
    else:
        f = open(fname)
        lines = f.readlines()
        f.close()
        for line in lines:
            line = line.strip()
            if line != '' and line.startswith('@'):
                k = line[1:]
                d[k] = {}
            elif line != '':
                lst = line.split(':')
                k2 = lst[0].strip()
                v = line[(line.find(':')+1):]
                d[k][k2] = v
    return d

def is_empty(fname):
    ''' Return True if fname exists but has no content. '''
    if os.path.exists(fname):
        f = open(fname)
        s = f.read()
        f.close()
        return s.strip() == ''
    return False

def get_largest_upload_chip_id(fname):
    ''' Return the largest numeric part of the @C0000... IDs already present in fname, or 0 if there is none. '''
    lst = []
    f = open(fname)
    for line in f:
        line = line.strip()
        if line.startswith('@C0000'):
            lst.append(int(line[2:]))
    f.close()
    if lst != []:
        return max(lst)
    else:
        return 0

def make_upload_dict(param_fname, included_path):
    d = {}
    i = get_largest_upload_chip_id(param_fname) + 1  # starting ID
    for fn in sorted(glob.glob(os.path.join(UPLOAD_DIR, '20*.*')), reverse=True):  # newer files are considered first, so the latest submission of duplicated content is kept
        k = 'C0000' + ('%d' % (i)).zfill(9)
        if good_file(fn) and not already_included(fn) and repeat(fn, d) == '':
            d[k] = make_chip_info_dict(fn)
            i += 1
        if good_file(fn):
            mark_it_as_included(fn)
            cmd = 'mv %s %s' % (fn, included_path)
            os.system(cmd)

    return d

def append_to_file(fname, s):
    f = open(fname, 'a')
    f.write('\n' + s + '\n')
    f.close()

def make_directory(my_dir):
    if not os.path.exists(my_dir):
        os.makedirs(my_dir)

def make_copy(fname):
    ''' Save a timestamped copy of fname before it is modified. '''
    if os.path.exists(fname):
        curr_time = datetime.now().strftime('%Y%m%d_%H%M%S')
        new_fname = fname + '.copy.%s' % (curr_time)
        f = open(fname)
        s = f.read()
        f.close()
        f = open(new_fname, 'w')
        f.write(s)
        f.close()

## main
included_path = os.path.join(UPLOAD_DIR, 'included')
make_directory(included_path)
upload_dict = make_upload_dict(PARAMETER_FOR_BUILDCMATRIX, included_path)
s = make_string(upload_dict)
if s != '':
    # before changing PARAMETER_FOR_BUILDCMATRIX, make a copy of it
    make_copy(PARAMETER_FOR_BUILDCMATRIX)
    append_to_file(PARAMETER_FOR_BUILDCMATRIX, s)
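The script ties together two small plain-text formats: the '#'-prefixed KEY:VALUE header that good_file() and make_chip_info_dict() read from an uploaded file, and the @ID blocks that make_string() appends to parameter_for_buildCmatrix.txt. The sketch below (not part of the commit) illustrates both; the file names, paths, and protein identifiers are hypothetical examples.

# Illustration only -- a minimal sketch with hypothetical values.

# 1) Header lines expected at the top of an uploaded narrowPeak/BED file.
#    good_file() accepts the file only if a '# ... STATUS:' line contains
#    SUCC or UNKNOWN; make_chip_info_dict() turns each '# KEY:VALUE' header
#    line into a dictionary entry.
example_upload_header = (
    '# STATUS: SUCC\n'
    '# PROTEIN_ID: AT1G01060\n'
    '# PROTEIN_NAME: LHY\n'
    'Chr1\t100\t200\tpeak1\t0\t.\t5.0\t-1\t-1\t50\n'
)

# 2) The ID scheme ('C0000' plus a 9-digit, zero-padded counter) and the
#    @ID block that make_string() appends for one accepted upload.
next_id_number = 1  # i.e. get_largest_upload_chip_id(...) + 1
chip_id = 'C0000' + ('%d' % next_id_number).zfill(9)

example_parameter_block = (
    '@%s\n' % chip_id +  # @C0000000000001
    'PROTEIN_ID:AT1G01060\n'
    'PROTEIN_NAME:LHY\n'
    'DATA_NAME:20170720120000_peaks.narrowPeak\n'
    'DATA_FORMAT:narrowPeak\n'
    'DESCRIPTION:user upload\n'
    'LOCATION:/path/to/Data/upload/chipseq/20170720120000_peaks.narrowPeak\n'
    'NOTE:update:20170720\n'
)

print(example_upload_header)
print(example_parameter_block)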