wordocc/wordocc.py

125 lines
4.0 KiB
Python
Executable File

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
WordOcc
Output csv with word,frequency
Stop Words support. You may try :
http://snowball.tartarus.org/algorithms/english/stop.txt
http://snowball.tartarus.org/algorithms/french/stop.txt
"""
import os, sys, codecs, re, optparse, logging
try:
    from textblob import TextBlob
except ImportError:
    # BUG FIX: sys.stderr is a file object, not a callable --
    # the original `sys.stderr(...)` raised TypeError instead of
    # printing the install hint. Use .write() on the stream.
    sys.stderr.write("""textblob library not found.
See https://textblob.readthedocs.org/en/dev/""")
    sys.exit(1)
# settings
BASE_DIR = os.path.dirname(os.path.abspath(__file__))  # directory containing this script
DEFAULT_ENCODING = "utf-8"  # encoding used for input text, stop-word file and csv output
DEFAULT_CSV_FILE = "wordocc.csv"  # default output filename (override with -o)
#############################################################################
def load_stop_words(path, encoding):
    """Load a stop-word list (one word per line, '|' starts a comment).

    Compatible with the snowball stop lists referenced in the module
    docstring. Words are lower-cased; blank and comment-only lines are
    ignored.

    :param path: path to the stop-word file
    :param encoding: text encoding of the file
    :return: set of lower-cased stop words
    """
    stop_words = set()
    # `with` guarantees the file is closed even if decoding fails
    # (the original left the handle open).
    with codecs.open(path, "r", encoding) as f:
        for line in f:
            # strip an optional '|' comment, then surrounding whitespace;
            # split() is a no-op when the line has no '|'
            word = line.split('|')[0].strip()
            if word:
                stop_words.add(word.lower())
    return stop_words
#############################################################################
def load_text(path, encoding):
    """Read a text file, undoing end-of-line hyphenation.

    A line ending in '-' has its word continued on the next line
    ("auto-\\nmatic" -> "automatic"); such pairs are merged. Blank lines
    (paragraph breaks) are turned into '. ' so the downstream tokenizer
    sees a sentence boundary.

    :param path: path to the text file
    :param encoding: text encoding of the file
    :return: the processed text as a single unicode string
    """
    lines = []
    # `with` closes the file even if decoding raises mid-read
    # (the original's explicit close() was skipped on error).
    with codecs.open(path, "r", encoding) as f:
        for line in f:
            if lines and lines[-1].endswith('-\n'):
                # previous line ended with a hyphenated word:
                # drop the '-' + newline and glue this line on
                lines[-1] = lines[-1].replace('-\n', '').strip() + line
            else:
                lines.append(line)
    return u"".join(lines).replace("\n\n", ". \n")
#############################################################################
def process(text, stop_words):
    """Tokenize *text* with TextBlob and count word occurrences.

    Filtering applied on top of TextBlob's tokenization:
    - stop words (case-insensitive on entry, case-sensitive after cleanup)
    - pure numbers and tokens starting with a digit
    - trailing list numbers glued to a word ("Chose.1" -> "Chose")
    - guillemets, which TextBlob does not strip
    - elided French articles ("l'avion" -> "avion")
    - single-character leftovers

    :param text: unicode text to analyse
    :param stop_words: set of lower-cased words to ignore
    :return: dict mapping word -> occurrence count
    """
    blob = TextBlob(text)
    stats = {}
    # raw string (the original '\w' escape is deprecated in Python 3)
    # and compiled once outside the loop instead of per word
    trailing_number = re.compile(r'\w+\.\d+$')
    for word in blob.words:
        word = word.strip()
        # skip empty tokens, stop words and anything numeric-leading
        if not word \
           or word.lower() in stop_words \
           or word.isdigit() or word[0].isdigit():
            continue
        # drop a list number glued to the word: "Chose.1" -> "Chose"
        # (NOTE: the original comment's "1.Chose" example is already
        # rejected above by the leading-digit check)
        if trailing_number.search(word):
            word = word.split('.')[0]
        # guillemets are not stripped by TextBlob
        word = word.replace(u"»", "").replace(u"«", "")
        # elided articles: "l'avion" / "d'avion" -> "avion"
        if "'" in word:
            word = word[word.find("'") + 1:]
        # final filtering
        if len(word) > 1 and word not in stop_words:
            stats[word] = stats.get(word, 0) + 1
    return stats
#############################################################################
def save_stats_csv(stats, path, encoding):
    """Write *stats* to *path* as 'word,count' lines, most frequent first.

    :param stats: dict mapping word -> occurrence count
    :param path: output csv file path
    :param encoding: text encoding for the output file
    """
    # sorted(key=...) replaces the Python-2-only list.sort(cmp=...) while
    # keeping the same descending-by-count order
    ranked = sorted(stats, key=stats.get, reverse=True)
    # `with` closes (and flushes) the file -- the original never closed it
    with codecs.open(path, "w", encoding) as f:
        for word in ranked:
            f.write(u"%s,%d\n" % (word, stats[word]))
#############################################################################
if __name__ == '__main__':
    # Command-line entry point: FILE -> csv of word,count pairs.
    cli = optparse.OptionParser("usage: %prog [options] FILE")
    cli.add_option("-s", "--stop-words", dest="stop_words",
                   default="", type="string",
                   help="path to stop word file")
    cli.add_option("-o", "--output", dest="output",
                   default=DEFAULT_CSV_FILE, type="string",
                   help="csv output filename (default: %s)" % DEFAULT_CSV_FILE)
    cli.add_option("-e", "--encoding", dest="encoding",
                   default=DEFAULT_ENCODING, type="string",
                   help="file encoding (default: %s)" % DEFAULT_ENCODING)
    opts, positional = cli.parse_args()
    if len(positional) != 1:
        cli.error("incorrect number of arguments")
    # read the input text, then the optional stop-word list
    text = load_text(positional[0], opts.encoding)
    stop_words = (load_stop_words(opts.stop_words, opts.encoding)
                  if opts.stop_words else set())
    # count occurrences and dump the csv
    save_stats_csv(process(text, stop_words), opts.output, opts.encoding)