diff options
author | Hui Lan <lanhui@zjnu.edu.cn> | 2019-12-04 19:03:19 +0800 |
---|---|---|
committer | Hui Lan <lanhui@zjnu.edu.cn> | 2019-12-04 19:03:19 +0800 |
commit | 97fdefab064f63642fa3ece05b807d29b459df31 (patch) | |
tree | a058530023224f3e35b1783996f3530c80c04bc5 /Code/count_word.py |
brain: add python and R code to local repository.
Diffstat (limited to 'Code/count_word.py')
-rw-r--r-- | Code/count_word.py | 36 |
1 files changed, 36 insertions, 0 deletions
# Usage: python count_word.py /home/hui/network/v03/Data/information/rnaseq_info_database.txt
#
# Purpose: get all words in a file, order them by their frequencies.
#
# The input is a tab-separated file with one header line.  Only the fields
# from the 5th column onwards are considered.  If no path is given on the
# command line, FILE_NAME below is used.
#
# 20 Apr 2017, slcu, hui

from collections import Counter
import operator
import string
import sys

# Default input file, used when no path is passed on the command line.
FILE_NAME = '/home/hui/network/v03/Data/information/rnaseq_info_database.txt'

# Index of the first column whose text is counted (0-based, i.e. 5th column).
FIRST_TEXT_COLUMN = 4

# Set form of string.punctuation for O(1) membership tests.
_PUNCTUATION = frozenset(string.punctuation)


def remove_punctuation(s):
    """Return s with ASCII punctuation removed from every token.

    s is split on whitespace, each character found in string.punctuation is
    dropped from each token, and the tokens are re-joined with single spaces.
    A token made only of punctuation becomes an empty string, so it can leave
    two adjacent spaces in the result; callers that split() the result again
    are unaffected.
    """
    # A character filter is used instead of str.translate(None, ...) because
    # that two-argument form only exists on Python 2 and raises TypeError on
    # Python 3.
    cleaned = (
        ''.join(ch for ch in token if ch not in _PUNCTUATION)
        for token in s.split()
    )
    return ' '.join(cleaned).strip()


def count_words(lines):
    """Count lower-cased, punctuation-free words in a tab-separated table.

    lines is any iterable of text lines (for example an open file).  The
    first line is treated as a header and skipped.  For every other line only
    the fields from column FIRST_TEXT_COLUMN onwards contribute words.

    Returns a collections.Counter mapping each word to its frequency.
    """
    counts = Counter()
    rows = iter(lines)
    next(rows, None)  # don't include header line
    for line in rows:
        # Strip only the line terminator: a plain strip() would also remove
        # leading tabs and shift the columns of rows whose first field is
        # empty.
        fields = line.rstrip('\r\n').split('\t')
        for field in fields[FIRST_TEXT_COLUMN:]:
            counts.update(remove_punctuation(field.lower()).split())
    return counts


def sort_by_frequency(counts):
    """Return (word, count) pairs ordered from most to least frequent.

    The sort is stable, so words with equal counts keep the order in which
    the mapping yields them.
    """
    return sorted(counts.items(), key=operator.itemgetter(1), reverse=True)


def main(argv=None):
    """Print 'word<TAB><TAB>count' for each word, most frequent first.

    argv follows the sys.argv convention: argv[1], when present, is the path
    of the input file; otherwise FILE_NAME is read.
    """
    if argv is None:
        argv = sys.argv
    path = argv[1] if len(argv) > 1 else FILE_NAME

    # The context manager closes the file even if reading fails part-way.
    with open(path) as f:
        counts = count_words(f)

    for word, count in sort_by_frequency(counts):
        print('%s\t\t%s' % (word, count))


if __name__ == '__main__':
    main(sys.argv)