125 lines
3.9 KiB
Python
125 lines
3.9 KiB
Python
|
#!/usr/bin/env python
|
||
|
# -*- coding: utf-8 -*-
|
||
|
"""
|
||
|
WordOcc
|
||
|
|
||
|
Output csv with word,frequency
|
||
|
|
||
|
Stop Words support. You may try :
|
||
|
http://snowball.tartarus.org/algorithms/english/stop.txt
|
||
|
http://snowball.tartarus.org/algorithms/french/stop.txt
|
||
|
"""
|
||
|
import os, sys, codecs, re, optparse, logging
|
||
|
|
||
|
# TextBlob does the tokenization; fail early with a pointer to the docs
# when the dependency is missing.
try:
    from textblob import TextBlob
except ImportError:
    # BUG FIX: sys.stderr is a file object, not a callable --
    # the original `sys.stderr(...)` raised TypeError instead of
    # printing the message. Use .write().
    sys.stderr.write("""textblob library not found.
See https://textblob.readthedocs.org/en/dev/""")
    sys.exit(1)
|
||
|
|
||
|
# settings
# Directory containing this script; used to resolve bundled data files.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# path stop word file
# NOTE(review): this default path is defined but not referenced in this
# chunk -- the CLI -s option supplies the stop-word path actually used.
STOP_WORDS_PATH = os.path.join(BASE_DIR, 'stop_words.txt')

# Encoding applied to every file read/written unless overridden with -e.
DEFAULT_ENCODING = "utf-8"
|
||
|
|
||
|
#############################################################################
|
||
|
def load_stop_words(path, encoding):
    """Load a stop-word list from *path*, one word per line.

    Anything after a '|' on a line is treated as a comment (the format
    used by the snowball stop lists); blank lines are skipped and words
    are lower-cased.

    :param path: path to the stop-word file
    :param encoding: text encoding of the file
    :return: set of lower-cased stop words
    """
    stop_words = set()
    # FIX: the original left the codecs.open() handle unclosed;
    # `with` guarantees the file is closed even on a decode error.
    with codecs.open(path, "r", encoding) as f:
        for line in f:
            line = line.strip()
            if '|' in line:
                # keep only the part before the comment marker
                line = line.split('|')[0].strip()
            if line:
                stop_words.add(line.lower())
    return stop_words
|
||
|
|
||
|
#############################################################################
|
||
|
def load_text(path, encoding):
    """Read *path* and return its text, repairing hyphenation.

    A line ending in '-' is a word broken by a caesura: it is glued to
    the following line. Blank-line paragraph breaks are turned into
    '. ' so the tokenizer sees a sentence boundary.

    :param path: path of the text file to read
    :param encoding: text encoding of the file
    :return: unicode string with caesuras joined and "\n\n" -> ". \n"
    """
    lines = []
    # FIX: use `with` so the handle is closed even if decoding raises;
    # the original only closed on the success path.
    with codecs.open(path, "r", encoding) as f:
        for line in f:
            if lines and lines[-1].endswith('-\n'):
                # previous line ends mid-word: drop the hyphen+newline
                # and append the continuation
                lines[-1] = lines[-1].replace('-\n', '').strip() + line
            else:
                lines.append(line)
    return u"".join(lines).replace("\n\n", ". \n")
|
||
|
|
||
|
#############################################################################
|
||
|
def process(text, stop_words):
    """Tokenize *text* with TextBlob and count word occurrences.

    Filtering applied to each token:
      * empty tokens, stop words and numeric tokens are skipped;
      * a trailing list marker is stripped ("Chose.1" -> "Chose");
      * guillemets are removed (TextBlob does not strip them);
      * French elision prefixes are dropped ("l'avion" -> "avion");
      * one-character results and stop words are discarded.

    :param text: unicode text to analyze
    :param stop_words: set of lower-cased words to ignore
    :return: dict mapping word -> occurrence count
    """
    blob = TextBlob(text)
    stats = {}

    for word in blob.words:
        word = word.strip()
        # skip empties, stop words and numbers (or digit-led tokens)
        if (not word
                or word.lower() in stop_words
                or word.isdigit()
                or word[0].isdigit()):
            continue

        # Strip a trailing ".<digits>" list marker: "Chose.1" -> "Chose".
        # FIX: raw string for the regex -- '\w'/'\d' in a plain string
        # are invalid escapes (DeprecationWarning on modern Python).
        if re.search(r'\w+\.\d+$', word):
            word = word.split('.')[0]

        # these characters are not stripped by TextBlob
        word = word.replace(u"\u00bb", "").replace(u"\u00ab", "")

        # handle words starting with "l'" or "d'": "l'avion" -> "avion"
        if "'" in word:
            word = word[word.find("'") + 1:]

        # final filtering: drop 1-char residue and stop words
        if len(word) > 1 and word not in stop_words:
            stats[word] = stats.get(word, 0) + 1
    return stats
|
||
|
|
||
|
#############################################################################
|
||
|
def save_stats_csv(stats, path, encoding):
    """Write word statistics to *path* as "word,count" lines.

    Words are ordered by descending occurrence count.

    :param stats: dict mapping word -> occurrence count
    :param path: output csv file path
    :param encoding: text encoding for the output file
    """
    # FIX: `stats.keys().sort(cmp_fn)` is Python-2-only (dict views have
    # no .sort and `cmp` is gone); `sorted` with a key works everywhere
    # and is clearer. Sort by top count.
    words = sorted(stats, key=stats.get, reverse=True)
    # FIX: the original never closed the output file; `with` flushes and
    # closes it deterministically.
    with codecs.open(path, "w", encoding) as f:
        for word in words:
            f.write(u"%s,%d\n" % (word, stats[word]))
|
||
|
|
||
|
#############################################################################
|
||
|
if __name__ == '__main__':
    # Command-line front end: FILE in, "word,count" CSV out.
    opt_parser = optparse.OptionParser("usage: %prog [options] FILE")
    opt_parser.add_option("-s", "--stop-words", dest="stop_words",
                          type="string", default="",
                          help="path to stop word file")
    opt_parser.add_option("-o", "--output", dest="output",
                          type="string", default="wordocc.csv",
                          help="csv output filename")
    opt_parser.add_option("-e", "--encoding", dest="encoding",
                          type="string", default=DEFAULT_ENCODING,
                          help="file encoding (default: %s)" % DEFAULT_ENCODING)

    opts, positional = opt_parser.parse_args()
    if len(positional) != 1:
        opt_parser.error("incorrect number of arguments")

    text = load_text(positional[0], opts.encoding)

    # An empty stop-word set disables filtering entirely.
    stop_words = (load_stop_words(opts.stop_words, opts.encoding)
                  if opts.stop_words else set())

    save_stats_csv(process(text, stop_words), opts.output, opts.encoding)
|