125 lines
3.9 KiB
Python
125 lines
3.9 KiB
Python
|
#!/usr/bin/env python
|
||
|
# -*- coding: utf-8 -*-
|
||
|
"""
|
||
|
WordOcc
|
||
|
|
||
|
Output csv with word,frequency
|
||
|
|
||
|
Stop Words support. You may try :
|
||
|
http://snowball.tartarus.org/algorithms/english/stop.txt
|
||
|
http://snowball.tartarus.org/algorithms/french/stop.txt
|
||
|
"""
|
||
|
import os, sys, codecs, re, optparse, logging
|
||
|
|
||
|
# TextBlob does the tokenization; fail early with a pointer to the docs
# when the dependency is missing.
try:
    from textblob import TextBlob
except ImportError:
    # BUG FIX: sys.stderr is a file object, not a callable --
    # the original `sys.stderr(...)` raised TypeError instead of
    # printing the message. Use .write().
    sys.stderr.write("""textblob library not found.
See https://textblob.readthedocs.org/en/dev/""")
    sys.exit(1)
|
||
|
|
||
|
# settings
# Directory containing this script; used to resolve bundled data files.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# path stop word file
# NOTE(review): this default path is defined but not referenced in this
# chunk -- the CLI -s option supplies the stop-word path actually used.
STOP_WORDS_PATH = os.path.join(BASE_DIR, 'stop_words.txt')

# Encoding applied to every file read/written unless overridden with -e.
DEFAULT_ENCODING = "utf-8"
|
||
|
|
||
|
#############################################################################
|
||
|
def load_stop_words(path, encoding):
    """Load a stop-word list from *path*, one word per line.

    Anything after a '|' on a line is treated as a comment (the format
    used by the snowball stop lists); blank lines are skipped and words
    are lower-cased.

    :param path: path to the stop-word file
    :param encoding: text encoding of the file
    :return: set of lower-cased stop words
    """
    stop_words = set()
    # FIX: the original left the codecs.open() handle unclosed;
    # `with` guarantees the file is closed even on a decode error.
    with codecs.open(path, "r", encoding) as f:
        for line in f:
            line = line.strip()
            if '|' in line:
                # keep only the part before the comment marker
                line = line.split('|')[0].strip()
            if line:
                stop_words.add(line.lower())
    return stop_words
|
||
|
|
||
|
#############################################################################
|
||
|
def load_text(path, encoding):
    """Read *path* and return its text, repairing hyphenation.

    A line ending in '-' is a word broken by a caesura: it is glued to
    the following line. Blank-line paragraph breaks are turned into
    '. ' so the tokenizer sees a sentence boundary.

    :param path: path of the text file to read
    :param encoding: text encoding of the file
    :return: unicode string with caesuras joined and "\n\n" -> ". \n"
    """
    lines = []
    # FIX: use `with` so the handle is closed even if decoding raises;
    # the original only closed on the success path.
    with codecs.open(path, "r", encoding) as f:
        for line in f:
            if lines and lines[-1].endswith('-\n'):
                # previous line ends mid-word: drop the hyphen+newline
                # and append the continuation
                lines[-1] = lines[-1].replace('-\n', '').strip() + line
            else:
                lines.append(line)
    return u"".join(lines).replace("\n\n", ". \n")
|
||
|
|
||
|
#############################################################################
|
||
|
def process(text, stop_words):
    """Tokenize *text* with TextBlob and count word occurrences.

    Filtering applied to each token:
      * empty tokens, stop words and numeric tokens are skipped;
      * a trailing list marker is stripped ("Chose.1" -> "Chose");
      * guillemets are removed (TextBlob does not strip them);
      * French elision prefixes are dropped ("l'avion" -> "avion");
      * one-character results and stop words are discarded.

    :param text: unicode text to analyze
    :param stop_words: set of lower-cased words to ignore
    :return: dict mapping word -> occurrence count
    """
    blob = TextBlob(text)
    stats = {}

    for word in blob.words:
        word = word.strip()
        # skip empties, stop words and numbers (or digit-led tokens)
        if (not word
                or word.lower() in stop_words
                or word.isdigit()
                or word[0].isdigit()):
            continue

        # Strip a trailing ".<digits>" list marker: "Chose.1" -> "Chose".
        # FIX: raw string for the regex -- '\w'/'\d' in a plain string
        # are invalid escapes (DeprecationWarning on modern Python).
        if re.search(r'\w+\.\d+$', word):
            word = word.split('.')[0]

        # these characters are not stripped by TextBlob
        word = word.replace(u"\u00bb", "").replace(u"\u00ab", "")

        # handle words starting with "l'" or "d'": "l'avion" -> "avion"
        if "'" in word:
            word = word[word.find("'") + 1:]

        # final filtering: drop 1-char residue and stop words
        if len(word) > 1 and word not in stop_words:
            stats[word] = stats.get(word, 0) + 1
    return stats
|
||
|
|
||
|
#############################################################################
|
||
|
def save_stats_csv(stats, path, encoding):
    """Write word statistics to *path* as "word,count" lines.

    Words are ordered by descending occurrence count.

    :param stats: dict mapping word -> occurrence count
    :param path: output csv file path
    :param encoding: text encoding for the output file
    """
    # FIX: `stats.keys().sort(cmp_fn)` is Python-2-only (dict views have
    # no .sort and `cmp` is gone); `sorted` with a key works everywhere
    # and is clearer. Sort by top count.
    words = sorted(stats, key=stats.get, reverse=True)
    # FIX: the original never closed the output file; `with` flushes and
    # closes it deterministically.
    with codecs.open(path, "w", encoding) as f:
        for word in words:
            f.write(u"%s,%d\n" % (word, stats[word]))
|
||
|
|
||
|
#############################################################################
|
||
|
if __name__ == '__main__':
    # Command-line front end: FILE in, "word,count" CSV out.
    opt_parser = optparse.OptionParser("usage: %prog [options] FILE")
    opt_parser.add_option("-s", "--stop-words", dest="stop_words",
                          type="string", default="",
                          help="path to stop word file")
    opt_parser.add_option("-o", "--output", dest="output",
                          type="string", default="wordocc.csv",
                          help="csv output filename")
    opt_parser.add_option("-e", "--encoding", dest="encoding",
                          type="string", default=DEFAULT_ENCODING,
                          help="file encoding (default: %s)" % DEFAULT_ENCODING)

    opts, positional = opt_parser.parse_args()
    if len(positional) != 1:
        opt_parser.error("incorrect number of arguments")

    text = load_text(positional[0], opts.encoding)

    # An empty stop-word set disables filtering entirely.
    stop_words = (load_stop_words(opts.stop_words, opts.encoding)
                  if opts.stop_words else set())

    save_stats_csv(process(text, stop_words), opts.output, opts.encoding)
|