# Usage: python make_upload_chip_parameter.py
#
# Purpose: make a part of parameter_for_buildCmatrix.txt given the uploaded
#          files in UPLOAD_DIR.  Each unique uploaded file will be assigned
#          an ID.  The assigned ID starts with C0000, followed by 9 digits.
#          The following cases are handled: (i) the same bed file uploaded
#          several times: only one copy is registered.
#          NOTE(review): this header originally said "the latest submission
#          will be used", but files are visited in ascending name order, so
#          the earliest identical submission is the one registered - confirm
#          which behaviour is intended.
#
# TBD: append to PARAMETER_FOR_BUILDCMATRIX
# Created 20 July 2017, slcu, hui

import glob
import os
import shutil
import sys
from datetime import datetime

PARAMETER_FOR_BUILDCMATRIX = '../Data/upload/parameter_for_buildCmatrix.txt'  # [change]
UPLOAD_DIR = '../Data/upload/chipseq'
INCLUDE_STAMP = 'BRAIN_HAS_INCLUDED_ME'


def good_file(fname):
    ''' Return True if the first decisive '# ... STATUS:' header line of fname
    reports SUCC or UNKNOWN, and False if it reports FAIL.

    STATUS lines carrying none of these words are skipped.  A file without
    any decisive STATUS line is not considered good.
    '''
    with open(fname) as f:
        for line in f:
            line = line.strip()
            if line.startswith('#') and 'STATUS:' in line:
                if 'SUCC' in line or 'UNKNOWN' in line:
                    return True
                if 'FAIL' in line:
                    return False
    return False


def already_included(fname):
    ''' Return True if fname is already processed, i.e. one of its first five
    lines is a '#' line containing INCLUDE_STAMP.
    '''
    with open(fname) as f:
        head = f.readlines()[:5]
    for line in head:
        line = line.strip()
        if line.startswith('#') and INCLUDE_STAMP in line:
            return True
    return False


def _content_lines(fname):
    ''' Return the stripped, non-empty lines of fname that do not start with '#'. '''
    with open(fname) as f:
        stripped = (line.strip() for line in f)
        return [line for line in stripped if line and not line.startswith('#')]


def same_content(f1, f2):
    ''' Test if two files, f1 and f2, have the same content.

    Lines starting with '#', blank lines and leading/trailing whitespace are
    ignored.  Return False if either file does not exist.
    '''
    if not (os.path.exists(f1) and os.path.exists(f2)):
        return False
    # Compare line by line rather than as one concatenated string, so that
    # 'ab' + 'c' is not mistaken for 'a' + 'bc'.
    return _content_lines(f1) == _content_lines(f2)


def repeat(fname, d):
    ''' Are there other files having the same content as fname?

    d maps an ID to a dictionary with a 'LOCATION' entry (a file path).
    Return '' if no entry matches; otherwise return the ID of the first
    entry whose file has the same content as fname.
    '''
    for k in d:
        if same_content(fname, d[k]['LOCATION']):
            return k
    return ''


def update_dict(d, k, fname):
    ''' Replace the entry d[k] with the information parsed from fname. '''
    d[k] = make_chip_info_dict(fname)


# Legacy implementation, kept for reference only.
#
# def update_it(upload_dir, upload_dict):
#     id_lst = sorted(upload_dict.keys())
#     if id_lst != []:
#         last_id = id_lst[-1]
#         last_id_number = int(last_id[5:])
#     else:
#         last_id_number = 0
#     for fname in sorted(glob.glob(os.path.join(UPLOAD_DIR, '20*.*'))): # all uploaded BED files start with time stamp 20.....
#         if good_file(fname) and not already_included(fname):
#             #print(upload_dict)
#             k = repeat(fname, upload_dict)
#             if k == '':
#                 k = '%d' % (last_id_number + 1)
#                 k = 'C0000' + k.zfill(9)
#                 upload_dict[k] = make_chip_info_dict(fname)
#             else:
#                 update_dict(upload_dict, k, fname)
#             mark_it_as_included(fname)


def make_chip_info_dict(fname):
    ''' Return a dictionary given a user submitted file.

    Every '#' header line containing a ':' contributes one entry: the key is
    the text between the last '#' and the following ':', the value is
    everything after the first ':' of the line (not stripped).  DATA_NAME,
    LOCATION and NOTE are always derived from fname and the current date,
    overriding any header of the same name.
    '''
    d = {'PROTEIN_ID': '', 'PROTEIN_NAME': '', 'DATA_NAME': '',
         'DATA_FORMAT': 'narrowPeak', 'DESCRIPTION': 'user upload',
         'LOCATION': '', 'NOTE': ''}
    with open(fname) as f:
        for line in f:
            line = line.strip()
            # A header without ':' has no key/value structure; skipping it
            # avoids storing the whole line as both key and value.
            if not line.startswith('#') or ':' not in line:
                continue
            s = line[(line.rfind('#') + 1):].strip()
            k = s.split(':')[0].strip()
            v = line[(line.find(':') + 1):]
            d[k] = v
    d['DATA_NAME'] = os.path.basename(fname)
    d['LOCATION'] = os.path.abspath(fname)
    d['NOTE'] = 'update:%s' % datetime.now().strftime('%Y%m%d')
    return d


def mark_it_as_included(fname):
    ''' Prepend a head line including INCLUDE_STAMP and the current time. '''
    with open(fname) as f:
        s = f.read()
    curr_time = datetime.now().strftime('%Y-%m-%d %H:%M')
    with open(fname, 'w') as f:
        f.write('# %s %s\n' % (INCLUDE_STAMP, curr_time) + s)


def make_string(d):
    ''' Format d (ID -> chip info dictionary) as parameter-file text.

    Entries are written in sorted ID order, each starting with '@ID' and
    followed by a blank line.  DATA_FORMAT is always written as narrowPeak.
    Return '' for an empty dictionary.
    '''
    parts = []
    for k in sorted(d.keys()):
        info = d[k]
        parts.append('@%s\n' % k)
        parts.append('PROTEIN_ID:%s\n' % info['PROTEIN_ID'])
        parts.append('PROTEIN_NAME:%s\n' % info['PROTEIN_NAME'])
        parts.append('DATA_NAME:%s\n' % info['DATA_NAME'])
        parts.append('DATA_FORMAT:narrowPeak\n')
        parts.append('DESCRIPTION:%s\n' % info['DESCRIPTION'])
        parts.append('LOCATION:%s\n' % info['LOCATION'])
        parts.append('NOTE:%s\n\n' % info['NOTE'])
    return ''.join(parts)


def md(fname):
    ''' Return a dictionary containing the parameter information.

    Each '@ID' line starts a new section; every following non-empty line is
    stored in that section as text-before-first-':' -> text-after-first-':'.
    Return {} if fname does not exist.
    '''
    d = {}
    if not os.path.exists(fname):
        return d
    k = None
    with open(fname) as f:
        for line in f:
            line = line.strip()
            if line == '':
                continue
            if line.startswith('@'):
                k = line[1:]
                d[k] = {}
            elif k is not None:  # lines before the first '@' belong to no section
                k2 = line.split(':')[0].strip()
                v = line[(line.find(':') + 1):]
                d[k][k2] = v
    return d


def is_empty(fname):
    ''' Return True if fname exists and has no content besides whitespace.

    Return False if fname does not exist.
    '''
    if os.path.exists(fname):
        with open(fname) as f:
            s = f.read()
        return s.strip() == ''
    return False


def get_largest_upload_chip_id(fname):
    ''' Return the largest numeric upload ID found on '@C0000...' lines of fname.

    Return 0 if fname does not exist or contains no such line.  Lines whose
    ID part is not purely numeric are ignored.
    '''
    if not os.path.exists(fname):
        return 0
    largest = 0
    with open(fname) as f:
        for line in f:
            line = line.strip()
            if line.startswith('@C0000'):
                digits = line[2:]  # drop the leading '@C'
                if digits.isdigit():
                    largest = max(largest, int(digits))
    return largest


def make_upload_dict(param_fname, included_path):
    ''' Register the new uploads found in UPLOAD_DIR.

    Every good file (see good_file) matching '20*.*' is stamped with
    INCLUDE_STAMP, unless it already carries the stamp, and moved into
    included_path.  Files that are not yet stamped and whose content differs
    from every file registered so far in this run get a fresh ID, numbered
    from one above the largest ID in param_fname.

    Return a dictionary ID -> chip info dictionary, whose LOCATION is the
    absolute path of the file after the move.
    '''
    d = {}
    i = get_largest_upload_chip_id(param_fname) + 1  # starting id
    # Ascending name order: uploads start with a time stamp, so presumably
    # older files are considered first - TODO confirm this is intended.
    for fn in sorted(glob.glob(os.path.join(UPLOAD_DIR, '20*.*'))):
        if not good_file(fn):
            continue
        stamped = already_included(fn)
        k = ''
        if not stamped and repeat(fn, d) == '':
            k = 'C0000' + ('%d' % i).zfill(9)
            d[k] = make_chip_info_dict(fn)
            i += 1
        if not stamped:
            mark_it_as_included(fn)
        # Move without a shell, so that unusual file names are handled safely.
        dest = os.path.join(included_path, os.path.basename(fn))
        try:
            shutil.move(fn, dest)
        except EnvironmentError as e:
            sys.stderr.write('Warning: cannot move %s to %s: %s\n' % (fn, dest, e))
            continue
        if k != '':
            # Record where the file lives now; the pre-move path no longer
            # exists, which would also defeat duplicate detection in repeat().
            d[k]['LOCATION'] = os.path.abspath(dest)
    return d


def append_to_file(fname, s):
    ''' Append s to fname, surrounded by newlines. '''
    with open(fname, 'a') as f:
        f.write('\n' + s + '\n')


def make_directory(my_dir):
    ''' Create my_dir (and missing parents) if it does not exist. '''
    if not os.path.exists(my_dir):
        os.makedirs(my_dir)


def make_copy(fname):
    ''' Copy fname to fname.copy.<YYYYmmdd_HHMMSS>; do nothing if fname is missing. '''
    if os.path.exists(fname):
        curr_time = datetime.now().strftime('%Y%m%d_%H%M%S')
        new_fname = fname + '.copy.%s' % (curr_time)
        shutil.copyfile(fname, new_fname)


def main():
    ''' Register new uploads and append them to PARAMETER_FOR_BUILDCMATRIX. '''
    included_path = os.path.join(UPLOAD_DIR, 'included')
    make_directory(included_path)

    upload_dict = make_upload_dict(PARAMETER_FOR_BUILDCMATRIX, included_path)
    s = make_string(upload_dict)
    if s != '':
        # before changing PARAMETER_FOR_BUILDCMATRIX, make a copy of it
        make_copy(PARAMETER_FOR_BUILDCMATRIX)
        append_to_file(PARAMETER_FOR_BUILDCMATRIX, s)


if __name__ == '__main__':
    main()