wordocc/wordocc.py
2015-12-07 15:30:02 +01:00

125 lines
3.9 KiB
Python
Executable File

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
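WordOcc

Count the occurrences of each word in a text file and write the result
to a CSV file, one "word,frequency" row per word.

Stop words are supported. Ready-made stop word lists are available at:
http://snowball.tartarus.org/algorithms/english/stop.txt
http://snowball.tartarus.org/algorithms/french/stop.txt
"""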
import os, sys, codecs, re, optparse, logging
# TextBlob does the word tokenizing; without it the script cannot run,
# so print a hint on stderr and exit with a non-zero status.
try:
    from textblob import TextBlob
except ImportError:
    # sys.stderr is a file object, not a callable: the message has to go
    # through write(), otherwise a TypeError hides the real problem.
    sys.stderr.write("""textblob library not found.
See https://textblob.readthedocs.org/en/dev/\n""")
    sys.exit(1)
# --- settings ---------------------------------------------------------------
# directory that contains this script
BASE_DIR = os.path.split(os.path.abspath(__file__))[0]
# stop word file located next to the script
STOP_WORDS_PATH = os.path.join(BASE_DIR, 'stop_words.txt')
# encoding used for input and output files unless overridden with -e
DEFAULT_ENCODING = "utf-8"
#############################################################################
def load_stop_words(path, encoding):
    """Load a stop word list from a text file.

    The file holds one stop word per line. Everything from the first ``|``
    on a line is treated as a comment and dropped; lines that are empty
    once the comment and surrounding whitespace are removed are skipped.

    :param path: path of the stop word file
    :param encoding: text encoding used to decode the file
    :return: set of lower-cased stop words
    """
    stop_words = set()
    # the context manager closes the file even if decoding fails midway
    with codecs.open(path, "r", encoding) as stop_file:
        for line in stop_file:
            # keep only the text before the first '|' (whole line if none)
            word = line.split('|')[0].strip()
            if word:
                stop_words.add(word.lower())
    return stop_words
#############################################################################
def load_text(path, encoding):
    """Read a text file and undo end-of-line hyphenation.

    A line ending with a hyphen is glued to the line that follows it, so a
    word split across two lines ("exam-" / "ple") is restored ("example").
    Afterwards every pair of consecutive newlines (a blank line) is replaced
    by a period, a space and a single newline -- presumably so that the
    tokenizer sees a sentence boundary between paragraphs.

    :param path: path of the text file
    :param encoding: text encoding used to decode the file
    :return: the whole text as a single unicode string
    """
    lines = []
    # the context manager closes the file even if decoding fails midway
    with codecs.open(path, "r", encoding) as text_file:
        for line in text_file:
            if lines and lines[-1].endswith('-\n'):
                # drop the trailing hyphen + newline of the previous line
                # and continue the word with the current line
                lines[-1] = lines[-1][:-2].strip() + line
            else:
                lines.append(line)
    return u"".join(lines).replace("\n\n", ". \n")
#############################################################################
def process(text, stop_words):
    """Count word occurrences in ``text``, ignoring stop words.

    Words are extracted with TextBlob and then cleaned up:

    * empty tokens, stop words and tokens starting with a digit are skipped;
    * a trailing ".<digits>" is cut off ("fin.1" -> "fin");
    * French quotation marks are removed;
    * everything up to the first apostrophe is dropped ("l'avion" -> "avion");
    * the cleaned word is kept only if it is longer than one character and
      is not a stop word.

    Stop word matching is case-insensitive; counting is case-sensitive
    ("Avion" and "avion" are counted separately).

    :param text: text to analyse
    :param stop_words: collection of lower-cased words to ignore
    :return: dict mapping each kept word to its number of occurrences
    """
    stats = {}
    for word in TextBlob(text).words:
        word = word.strip()
        # skip empty tokens, stop words and anything starting with a digit
        # (which also covers tokens made of digits only)
        if not word or word.lower() in stop_words or word[0].isdigit():
            continue
        # word with a number glued after a period: "fin.1" -> "fin"
        if re.search(r'\w+\.\d+$', word):
            word = word.split('.')[0]
        # these characters are not stripped by TextBlob
        word = word.replace(u"»", "").replace(u"«", "")
        # handle words starting with "l'" or "d'": "l'avion" -> "avion"
        if "'" in word:
            word = word[word.find("'") + 1:]
        # final filtering: the clean-up above may have uncovered a stop word
        # ("L'Avion" -> "Avion"), so compare lower-cased like the first check
        if len(word) > 1 and word.lower() not in stop_words:
            stats[word] = stats.get(word, 0) + 1
    return stats
#############################################################################
def save_stats_csv(stats, path, encoding):
    """Write the word counts to a CSV file, most frequent word first.

    Each output line has the form ``word,count``. No header row is written
    and values are not quoted.

    :param stats: dict mapping word to number of occurrences
    :param path: path of the CSV file to create (overwritten if it exists)
    :param encoding: text encoding of the output file
    """
    # sorted() with a key function works on Python 2 and 3, unlike
    # dict.keys().sort(cmp-function) which only exists on Python 2
    words = sorted(stats, key=stats.get, reverse=True)
    # the context manager flushes and closes the file in every case
    with codecs.open(path, "w", encoding) as csv_file:
        for word in words:
            csv_file.write(u"%s,%d\n" % (word, stats[word]))
#############################################################################
if __name__ == '__main__':
    # Command line entry point: wordocc.py [options] FILE
    parser = optparse.OptionParser("usage: %prog [options] FILE")
    parser.add_option(
        "-s", "--stop-words",
        dest="stop_words", type="string", default="",
        help="path to stop word file")
    parser.add_option(
        "-o", "--output",
        dest="output", type="string", default="wordocc.csv",
        help="csv output filename")
    parser.add_option(
        "-e", "--encoding",
        dest="encoding", type="string", default=DEFAULT_ENCODING,
        help="file encoding (default: %s)" % DEFAULT_ENCODING)
    options, args = parser.parse_args()

    # exactly one positional argument is expected: the text file to analyse
    if len(args) != 1:
        parser.error("incorrect number of arguments")
    input_path = args[0]

    # read the text first, then the optional stop word list
    text = load_text(input_path, options.encoding)
    stop_words = (load_stop_words(options.stop_words, options.encoding)
                  if options.stop_words else set())

    # count the words and write the csv report
    stats = process(text, stop_words)
    save_stats_csv(stats, options.output, options.encoding)