From 44b942868519733f79baee8b8e529dd1d600c941 Mon Sep 17 00:00:00 2001
From: Mutah
Date: Mon, 7 Dec 2015 15:30:02 +0100
Subject: [PATCH] Initial add

---
 README.md  |  51 ++++++++++++++++++++++
 wordocc.py | 125 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 176 insertions(+)
 create mode 100644 README.md
 create mode 100755 wordocc.py

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e889937
--- /dev/null
+++ b/README.md
@@ -0,0 +1,51 @@
+# WordOcc
+
+A word frequency tool that outputs sorted results in CSV format, with stop word support.
+
+# Requirements
+
+ - Python 2.6
+ - TextBlob (https://textblob.readthedocs.org/en/dev/)
+
+# Usage
+
+## Basic
+
+    python wordocc.py an_interesting_text.txt
+
+Writes output like the following to wordocc.csv:
+
+    top,43
+    image,31
+    sample,29
+    ...
+
+## Options
+
+    wordocc.py -h
+    Usage: wordocc.py [options] FILE
+
+    Options:
+      -h, --help            show this help message and exit
+      -s STOP_WORDS, --stop-words=STOP_WORDS
+                            path to stop word file
+      -o OUTPUT, --output=OUTPUT
+                            csv output filename
+      -e ENCODING, --encoding=ENCODING
+                            file encoding (default: utf-8)
+
+## Stop words
+
+### Introduction
+
+Stop words are words that are not relevant to the frequency analysis, such as articles, conjunctions, etc.
+
+You have to provide a file containing those words, one per line; anything after a '|' on a line is treated as a comment and ignored. The following files can help:
+
+ - English: http://snowball.tartarus.org/algorithms/english/stop.txt
+ - French: http://snowball.tartarus.org/algorithms/french/stop.txt
+
+### Usage
+
+    python wordocc.py -s /home/jdoe/en/stop.txt an_interesting_text.txt
\ No newline at end of file
diff --git a/wordocc.py b/wordocc.py
new file mode 100755
index 0000000..5858901
--- /dev/null
+++ b/wordocc.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+WordOcc
+
+Outputs a CSV of word,frequency pairs.
+
+Stop word support. You may try:
+    http://snowball.tartarus.org/algorithms/english/stop.txt
+    http://snowball.tartarus.org/algorithms/french/stop.txt
+"""
+import os, sys, codecs, re, optparse
+
+try:
+    from textblob import TextBlob
+except ImportError:
+    sys.stderr.write("""textblob library not found.
+See https://textblob.readthedocs.org/en/dev/
+""")
+    sys.exit(1)
+
+# settings
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+
+# default stop word file path
+STOP_WORDS_PATH = os.path.join(BASE_DIR, 'stop_words.txt')
+
+DEFAULT_ENCODING = "utf-8"
+
+#############################################################################
+def load_stop_words(path, encoding):
+    """Load the stop word list, handling '|' comments."""
+    stop_words = set()
+    for line in codecs.open(path, "r", encoding):
+        line = line.strip()
+        if '|' in line:
+            line = line.split('|')[0].strip()
+        if line:
+            stop_words.add(line.lower())
+    return stop_words
+
+#############################################################################
+def load_text(path, encoding):
+    """Load the text, re-joining words hyphenated across line breaks."""
+    f = codecs.open(path, "r", encoding)
+    lines = []
+    for line in f:
+        if lines and lines[-1].endswith('-\n'):
+            lines[-1] = lines[-1].replace('-\n', '').strip() + line
+        else:
+            lines.append(line)
+    f.close()
+    # turn paragraph breaks into sentence breaks
+    return u"".join(lines).replace("\n\n", ". \n")
+
+#############################################################################
+def process(text, stop_words):
+    """Use TextBlob for word parsing, then apply some extra filtering."""
+    blob = TextBlob(text)
+    stats = {}
+
+    for word in blob.words:
+        word = word.strip()
+        # skip empty strings, stop words and numbers
+        if not word \
+           or word.lower() in stop_words \
+           or word.isdigit() or word[0].isdigit():
+            continue
+
+        # strip a trailing list number: "Chose.1" -> "Chose"
+        if re.search(r'\w+\.\d+$', word):
+            word = word.split('.')[0]
+
+        # these characters are not stripped by TextBlob
+        word = word.replace(u"»", "")
+        word = word.replace(u"«", "")
+
+        # handle words starting with "l'" or "d'": "l'avion" -> "avion"
+        if "'" in word:
+            word = word[word.find("'")+1:]
+
+        # final filtering
+        if len(word) > 1 and word.lower() not in stop_words:
+            if word not in stats:
+                stats[word] = 0
+            stats[word] += 1
+    return stats
+
+#############################################################################
+def save_stats_csv(stats, path, encoding):
+    """Write the stats as CSV, most frequent words first."""
+    words = sorted(stats, key=stats.get, reverse=True)
+    f = codecs.open(path, "w", encoding)
+    for word in words:
+        f.write(u"%s,%d\n" % (word, stats[word]))
+    f.close()
+
+#############################################################################
+if __name__ == '__main__':
+    parser = optparse.OptionParser("usage: %prog [options] FILE")
+    parser.add_option("-s", "--stop-words", dest="stop_words",
+                      default="", type="string",
+                      help="path to stop word file")
+
+    parser.add_option("-o", "--output", dest="output", default="wordocc.csv",
+                      type="string", help="csv output filename")
+
+    parser.add_option("-e", "--encoding", dest="encoding", default=DEFAULT_ENCODING,
+                      type="string", help="file encoding (default: %s)" % DEFAULT_ENCODING)
+
+    (options, args) = parser.parse_args()
+    if len(args) != 1:
+        parser.error("incorrect number of arguments")
+
+    text = load_text(args[0], options.encoding)
+
+    if options.stop_words:
+        stop_words = load_stop_words(options.stop_words, options.encoding)
+    else:
+        stop_words = set()
+
+    stats = process(text, stop_words)
+
+    save_stats_csv(stats, options.output, options.encoding)
\ No newline at end of file