Initial add

commit 44b9428685

README.md (51 lines, Normal file)

@@ -0,0 +1,51 @@
# WordOcc

A word frequency tool that outputs sorted results in csv format, with stop word support.

# Requirements

 - Python 2.6
 - TextBlob https://textblob.readthedocs.org/en/dev/ (see the install sketch below)
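
TextBlob is available from PyPI; a typical install (assuming pip is available and that the corpora download command is still the one documented by TextBlob) looks like:

    pip install textblob
    python -m textblob.download_corpora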

# Usage

## Basic

    python wordocc.py a_interesting_text.txt

Writes output like the following to wordocc.csv:

    top,43
    image,31
    sample,29
    ...
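
The output is plain "word,count" CSV in the chosen encoding, so it is easy to post-process. A minimal sketch for reading it back in Python (the file name is just the default shown above):

    import codecs

    # each line written by wordocc.py is "word,count"
    for line in codecs.open("wordocc.csv", "r", "utf-8"):
        word, count = line.strip().rsplit(",", 1)
        count = int(count)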

## Options

    wordocc.py -h
    Usage: wordocc.py [options] FILE

    Options:
      -h, --help            show this help message and exit
      -s STOP_WORDS, --stop-words=STOP_WORDS
                            path to stop word file
      -o OUTPUT, --output=OUTPUT
                            csv output filename
      -e ENCODING, --encoding=ENCODING
                            file encoding (default: utf-8)
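
All options can be combined, for example (paths and file names here are only illustrative):

    python wordocc.py -s stop.txt -o report.csv -e latin-1 a_interesting_text.txt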

## Stop words

### Introduction

Stop words are words that are not interesting for the statistical study, such as articles, conjunctions, etc.

You have to provide a file containing those words (one per line); a short sample is shown below. The following files can help:

 - English: http://snowball.tartarus.org/algorithms/english/stop.txt
 - French: http://snowball.tartarus.org/algorithms/french/stop.txt
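
A stop word file simply lists one word per line; anything after a '|' on a line is ignored as a comment (the same convention as the Snowball lists above), and words are lowercased when loaded. For example:

    the
    a      | article
    and    | conjunction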

### Usage

    python wordocc.py -s /home/jdoe/en/stop.txt a_interesting_text.txt

wordocc.py (125 lines, Executable file)

@@ -0,0 +1,125 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
WordOcc

Output csv with word,frequency

Stop word support. You may try:
    http://snowball.tartarus.org/algorithms/english/stop.txt
    http://snowball.tartarus.org/algorithms/french/stop.txt
"""
import os, sys, codecs, re, optparse, logging

try:
    from textblob import TextBlob
except ImportError:
    sys.stderr.write("""textblob library not found.
See https://textblob.readthedocs.org/en/dev/
""")
    sys.exit(1)

# settings
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# path to the stop word file
STOP_WORDS_PATH = os.path.join(BASE_DIR, 'stop_words.txt')

DEFAULT_ENCODING = "utf-8"

#############################################################################
def load_stop_words(path, encoding):
    """Load the stop word list, handling comments with '|'."""
    stop_words = set()
    for line in codecs.open(path, "r", encoding):
        line = line.strip()
        if '|' in line:
            line = line.split('|')[0].strip()
        if line:
            stop_words.add(line.lower())
    return stop_words

#############################################################################
def load_text(path, encoding):
    """Load the text, joining words hyphenated across line breaks."""
    f = codecs.open(path, "r", encoding)
    lines = []
    for line in f:
        if lines and lines[-1].endswith('-\n'):
            lines[-1] = lines[-1].replace('-\n', '').strip() + line
        else:
            lines.append(line)
    f.close()
    return u"".join(lines).replace("\n\n", ". \n")
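
# Illustrative example: for a file containing "some hy-\nphenated text\n",
# load_text() returns u"some hyphenated text\n"; a blank line in the input
# ("\n\n") is turned into ". \n" so paragraph breaks become sentence
# boundaries before TextBlob parses the text.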

#############################################################################
def process(text, stop_words):
    """Use TextBlob for word parsing and add some extra processing."""
    blob = TextBlob(text)
    stats = {}

    for word in blob.words:
        word = word.strip()
        # skip empty words, stop words and numbers
        if not word \
            or word.lower() in stop_words \
            or word.isdigit() or word[0].isdigit():
            continue

        # strip a trailing numeric suffix (e.g. footnote marker): "Chose.1" -> "Chose"
        if re.search(r'\w+\.\d+$', word):
            word = word.split('.')[0]

        # these characters are not stripped by TextBlob
        word = word.replace(u"»", "")
        word = word.replace(u"«", "")

        # handle words starting with "l'" or "d'": "l'avion" -> "avion"
        if "'" in word:
            word = word[word.find("'")+1:]

        # final filtering
        if len(word) > 1 and word not in stop_words:
            if word not in stats:
                stats[word] = 0
            stats[word] += 1
    return stats
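
# Illustrative example: process(u"The cat saw the cat", set([u"the"]))
# yields counts equivalent to {u'cat': 2, u'saw': 1} ("the" is filtered as
# a stop word, case-insensitively).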

#############################################################################
def save_stats_csv(stats, path, encoding):
    """Write csv stats"""
    # Sort words by descending count
    words = stats.keys()
    words.sort(key=lambda w: stats[w], reverse=True)
    f = codecs.open(path, "w", encoding)
    # Output results
    for word in words:
        f.write(u"%s,%d\n" % (word, stats[word]))
    f.close()

#############################################################################
if __name__ == '__main__':
    parser = optparse.OptionParser("usage: %prog [options] FILE")
    parser.add_option("-s", "--stop-words", dest="stop_words",
                      default="", type="string",
                      help="path to stop word file")

    parser.add_option("-o", "--output", dest="output", default="wordocc.csv",
                      type="string", help="csv output filename")

    parser.add_option("-e", "--encoding", dest="encoding", default=DEFAULT_ENCODING,
                      type="string", help="file encoding (default: %s)" % DEFAULT_ENCODING)

    (options, args) = parser.parse_args()
    if len(args) != 1:
        parser.error("incorrect number of arguments")

    text = load_text(args[0], options.encoding)

    if options.stop_words:
        stop_words = load_stop_words(options.stop_words, options.encoding)
    else:
        stop_words = set()

    stats = process(text, stop_words)

    save_stats_csv(stats, options.output, options.encoding)
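
Since the heavy lifting lives in load_text, load_stop_words, process and save_stats_csv, the script can also be driven from another Python program. A minimal sketch (file names are placeholders; it assumes wordocc.py is importable, e.g. from the current directory):

    import wordocc

    text = wordocc.load_text("a_interesting_text.txt", "utf-8")
    stop_words = wordocc.load_stop_words("stop.txt", "utf-8")
    stats = wordocc.process(text, stop_words)
    wordocc.save_stats_csv(stats, "wordocc.csv", "utf-8")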