#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
WordOcc

A word frequency tool that outputs sorted results in csv format
(one ``word,frequency`` row per line), supporting stop words.

Requirements
------------
 - Python 2.6 or later
 - TextBlob https://textblob.readthedocs.org/en/dev/

Basic usage
-----------
    python wordocc.py an_interesting_text.txt

writes content such as the following to ``wordocc.csv``::

    top,43
    image,31
    sample,29
    ...

Options
-------
    Usage: wordocc.py [options] FILE

    -h, --help            show this help message and exit
    -s STOP_WORDS, --stop-words=STOP_WORDS
                          path to stop word file
    -o OUTPUT, --output=OUTPUT
                          csv output filename
    -e ENCODING, --encoding=ENCODING
                          file encoding (default: utf-8)

Stop words
----------
Stop words are words that are not interesting for the statistical study,
such as articles, conjunctions, etc.  They are read from a file containing
one word per line; anything after a ``|`` on a line is a comment.
The following files can help:

 - English: http://snowball.tartarus.org/algorithms/english/stop.txt
 - French:  http://snowball.tartarus.org/algorithms/french/stop.txt

Example::

    python wordocc.py -s /home/jdoe/en/stop.txt an_interesting_text.txt
"""
import codecs
import io
import logging
import optparse
import os
import re
import sys

try:
    from textblob import TextBlob
except ImportError:
    # Do not exit at import time: the file helpers below are usable without
    # TextBlob.  The missing dependency is reported by main() / process().
    TextBlob = None

# settings
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# path stop word file
STOP_WORDS_PATH = os.path.join(BASE_DIR, 'stop_words.txt')

DEFAULT_ENCODING = "utf-8"

# Message shown when the TextBlob dependency cannot be imported.
TEXTBLOB_MISSING_MESSAGE = ("textblob library not found. \n"
                            "See https://textblob.readthedocs.org/en/dev/")

# Matches a token that ends with ".<digits>", e.g. "Chose.1".
TRAILING_NUMBER_RE = re.compile(r'\w+\.\d+$')

# French quotation marks (right and left guillemets), which TextBlob leaves
# attached to words.  Written as escapes so the module does not depend on
# the source file encoding.
GUILLEMETS = (u"\u00bb", u"\u00ab")


#############################################################################
def load_stop_words(path, encoding):
    """Load the stop word list stored in the file `path`.

    The file holds one word per line.  Everything following a '|' on a line
    is a comment and is ignored; blank lines are skipped.

    path -- path of the stop word file
    encoding -- text encoding of that file

    Returns a set of lower-cased stop words.
    """
    stop_words = set()
    with io.open(path, "r", encoding=encoding) as f:
        for line in f:
            word = line.split('|', 1)[0].strip()
            if word:
                stop_words.add(word.lower())
    return stop_words


#############################################################################
def load_text(path, encoding):
    """Read the text stored in `path`, undoing end-of-line hyphenation.

    A line ending with '-' is glued to the line that follows it, so that a
    word cut across two lines ("exam-" / "ple") becomes one word again.
    Blank lines (paragraph breaks) are turned into ". " followed by a
    newline.

    path -- path of the text file
    encoding -- text encoding of that file

    Returns the whole text as a single unicode string.
    """
    lines = []
    # io.open translates "\r\n" to "\n", so the '-\n' and '\n\n' tests below
    # also work on files with Windows line endings.
    with io.open(path, "r", encoding=encoding) as f:
        for line in f:
            if lines and lines[-1].endswith(u'-\n'):
                # drop the trailing hyphen + newline, then append the
                # continuation line
                lines[-1] = lines[-1][:-2].strip() + line
            else:
                lines.append(line)
    return u"".join(lines).replace(u"\n\n", u". \n")


#############################################################################
def count_words(words, stop_words):
    """Count the occurrences of the interesting words found in `words`.

    words -- iterable of word strings (e.g. ``TextBlob(text).words``)
    stop_words -- collection of lower-cased words to ignore

    Returns a dict mapping each kept word (case preserved) to its number of
    occurrences.
    """
    stats = {}
    for word in words:
        word = word.strip()
        # Skip empty tokens, stop words and anything starting with a digit
        # (this covers plain numbers as well as tokens such as "1.Chose").
        if not word or word.lower() in stop_words or word[0].isdigit():
            continue

        # Drop a trailing number glued to the word: "Chose.1" -> "Chose"
        if TRAILING_NUMBER_RE.search(word):
            word = word.split('.')[0]

        # those characters are not stripped by TextBlob
        for mark in GUILLEMETS:
            word = word.replace(mark, "")

        # handle words starting by "l'" or "d'" : "l'avion" -> "avion"
        if "'" in word:
            word = word[word.find("'") + 1:]

        # Final filtering.  The stop word list is lower-cased, so the test
        # has to be case-insensitive as well.
        if len(word) > 1 and word.lower() not in stop_words:
            stats[word] = stats.get(word, 0) + 1
    return stats


#############################################################################
def process(text, stop_words):
    """Split `text` into words with TextBlob and count them.

    text -- the text to analyse
    stop_words -- collection of lower-cased words to ignore

    Returns a dict mapping each word to its number of occurrences.
    Raises ImportError when the textblob library is not installed.
    """
    if TextBlob is None:
        raise ImportError(TEXTBLOB_MISSING_MESSAGE)
    return count_words(TextBlob(text).words, stop_words)


#############################################################################
def save_stats_csv(stats, path, encoding):
    """Write the statistics to `path` as "word,count" lines.

    Rows are sorted by decreasing count; words with the same count are
    written in alphabetical order so that the output is reproducible.

    stats -- dict mapping words to their number of occurrences
    path -- path of the csv file to write
    encoding -- text encoding of the output file
    """
    words = sorted(stats, key=lambda word: (-stats[word], word))
    # newline="" keeps "\n" line endings on every platform.
    with io.open(path, "w", encoding=encoding, newline="") as f:
        for word in words:
            f.write(u"%s,%d\n" % (word, stats[word]))


#############################################################################
def main(argv=None):
    """Command line entry point.

    argv -- list of command line arguments; defaults to ``sys.argv[1:]``
    """
    if TextBlob is None:
        sys.stderr.write(TEXTBLOB_MISSING_MESSAGE + "\n")
        sys.exit(1)

    parser = optparse.OptionParser("usage: %prog [options] FILE")
    parser.add_option("-s", "--stop-words", dest="stop_words",
                      default="", type="string",
                      help="path to stop word file")

    parser.add_option("-o", "--output", dest="output", default="wordocc.csv",
                      type="string", help="csv output filename")

    parser.add_option("-e", "--encoding", dest="encoding",
                      default=DEFAULT_ENCODING, type="string",
                      help="file encoding (default: %s)" % DEFAULT_ENCODING)

    (options, args) = parser.parse_args(argv)
    if len(args) != 1:
        parser.error("incorrect number of arguments")

    text = load_text(args[0], options.encoding)

    if options.stop_words:
        stop_words = load_stop_words(options.stop_words, options.encoding)
    else:
        stop_words = set()

    stats = process(text, stop_words)

    save_stats_csv(stats, options.output, options.encoding)


if __name__ == '__main__':
    main()