# Usage: python count_word.py /home/hui/network/v03/Data/information/rnaseq_info_database.txt
#
# Purpose: get all words in a file, order them by their frequencies.
#
# 20 Apr 2017, slcu, hui

from collections import Counter
import operator
import string
import sys

# Default input file, used only when no path is given on the command line.
FILE_NAME = '/home/hui/network/v03/Data/information/rnaseq_info_database.txt'

# Deletion table built once: maps every ASCII punctuation character to None.
# (Python 3 form; the Python 2 call ``s.translate(None, chars)`` raises
# TypeError on Python 3.)
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(s):
    """Return *s* with ASCII punctuation removed from each whitespace-separated token.

    Tokens are re-joined with single spaces and the result is stripped of
    leading/trailing whitespace. A token consisting only of punctuation
    becomes an empty string, so it can leave a doubled space in the middle
    of the result.
    """
    return ' '.join(x.translate(_PUNCTUATION_TABLE) for x in s.split()).strip()


def count_words(lines):
    """Count lower-cased words in the 5th and later tab-separated columns.

    *lines* is any iterable of text lines (a list or an open file). The
    first line is treated as a header and skipped. Returns a ``Counter``
    mapping each word to its number of occurrences.
    """
    counts = Counter()
    rows = iter(lines)
    next(rows, None)  # don't include header line
    for line in rows:
        # Strip only the line terminator: a plain strip() would also eat
        # leading tabs and shift the columns of rows whose first field is
        # empty.
        fields = line.rstrip('\r\n').split('\t')
        for field in fields[4:]:  # only consider fields from 5th column.
            counts.update(remove_punctuation(field.lower()).split())
    return counts


def main(argv=None):
    """Print every word and its count, most frequent first.

    The input path is taken from ``argv[1]`` (``sys.argv`` when *argv* is
    None); if it is absent, ``FILE_NAME`` is used.
    """
    argv = sys.argv if argv is None else argv
    path = argv[1] if len(argv) > 1 else FILE_NAME

    with open(path) as f:
        d = count_words(f)

    # sorted() is stable, so words with equal counts keep first-seen order.
    sd = sorted(d.items(), key=operator.itemgetter(1), reverse=True)
    for word, count in sd:
        print('%s\t\t%s' % (word, count))


if __name__ == '__main__':
    main()