Initial add
This commit is contained in:
commit
44b9428685
51
README.md
Normal file
51
README.md
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
# WordOcc
|
||||||
|
|
||||||
|
A word frequency tool that outputs sorted results in CSV format, supporting stop words.
|
||||||
|
|
||||||
|
# Requirements
|
||||||
|
|
||||||
|
- Python 2.6
|
||||||
|
- TextBlob https://textblob.readthedocs.org/en/dev/
|
||||||
|
|
||||||
|
# Usage
|
||||||
|
|
||||||
|
## Basic
|
||||||
|
|
||||||
|
python wordocc.py a_interesting_text.txt
|
||||||
|
|
||||||
|
Outputs content like the following in wordocc.csv:
|
||||||
|
|
||||||
|
top,43
|
||||||
|
image,31
|
||||||
|
sample,29
|
||||||
|
...
|
||||||
|
|
||||||
|
## Options
|
||||||
|
|
||||||
|
wordocc.py -h
|
||||||
|
Usage: wordocc.py [options] FILE
|
||||||
|
|
||||||
|
Options:
|
||||||
|
-h, --help show this help message and exit
|
||||||
|
-s STOP_WORDS, --stop-words=STOP_WORDS
|
||||||
|
path to stop word file
|
||||||
|
-o OUTPUT, --output=OUTPUT
|
||||||
|
csv output filename
|
||||||
|
-e ENCODING, --encoding=ENCODING
|
||||||
|
file encoding (default: utf-8)
|
||||||
|
|
||||||
|
|
||||||
|
## Stop words
|
||||||
|
|
||||||
|
### Introduction
|
||||||
|
|
||||||
|
Stop words are words that are not interesting for the statistic study, like articles, conjunctions, etc ...
|
||||||
|
|
||||||
|
You have to provide a file containing those words (one per line). The following files can help:
|
||||||
|
|
||||||
|
- English: http://snowball.tartarus.org/algorithms/english/stop.txt
|
||||||
|
- French: http://snowball.tartarus.org/algorithms/french/stop.txt
|
||||||
|
|
||||||
|
### Usage
|
||||||
|
|
||||||
|
python wordocc.py -s /home/jdoe/en/stop.txt a_interesting_text.txt
|
125
wordocc.py
Executable file
125
wordocc.py
Executable file
@ -0,0 +1,125 @@
|
|||||||
|
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
WordOcc

Output csv with word,frequency

Stop Words support. You may try :
http://snowball.tartarus.org/algorithms/english/stop.txt
http://snowball.tartarus.org/algorithms/french/stop.txt
"""
import os, sys, codecs, re, optparse, logging


try:
    from textblob import TextBlob
except ImportError:
    # BUG FIX: sys.stderr is a file object, not a callable; calling it
    # raised "TypeError: 'file' object is not callable" and hid the hint.
    sys.stderr.write("""textblob library not found.
See https://textblob.readthedocs.org/en/dev/""")
    sys.exit(1)


# settings
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# path stop word file (default location next to this script)
STOP_WORDS_PATH = os.path.join(BASE_DIR, 'stop_words.txt')

DEFAULT_ENCODING = "utf-8"
|
||||||
|
|
||||||
|
#############################################################################
|
||||||
|
def load_stop_words(path, encoding):
    """Load the stop-word list from a file, one word per line.

    A '|' starts a comment running to end of line (the convention used
    by the snowball stop-word files).  Words are lower-cased so lookups
    can be done case-insensitively; blank/comment-only lines are skipped.

    :param path: path of the stop-word file
    :param encoding: text encoding of the file
    :return: set of lower-cased stop words
    """
    stop_words = set()
    f = codecs.open(path, "r", encoding)
    try:
        for line in f:
            line = line.strip()
            if '|' in line:
                # keep only the part before the comment marker
                line = line.split('|')[0].strip()
            if line:
                stop_words.add(line.lower())
    finally:
        f.close()  # the original never released the file handle
    return stop_words
|
||||||
|
|
||||||
|
#############################################################################
|
||||||
|
def load_text(path, encoding):
    """Read a text file and return its content as one unicode string.

    Two normalisations are applied:
    - end-of-line hyphenation (cesure) is undone: a line ending with a
      hyphen right before the newline is glued to the next line, so the
      split word is restored;
    - empty lines (paragraph breaks) become ". " so the tokenizer sees
      a sentence boundary there.

    :param path: path of the text file
    :param encoding: text encoding of the file
    :return: normalised unicode text
    """
    lines = []
    f = codecs.open(path, "r", encoding)
    try:
        for line in f:
            if lines and lines[-1].endswith('-\n'):
                # previous line ended with a hyphen: merge the two halves
                lines[-1] = lines[-1].replace('-\n', '').strip() + line
            else:
                lines.append(line)
    finally:
        f.close()  # close even if decoding fails mid-file
    return u"".join(lines).replace("\n\n", ". \n")
|
||||||
|
|
||||||
|
#############################################################################
|
||||||
|
def process(text, stop_words):
    """Tokenize *text* with TextBlob and count word occurrences.

    Tokens are filtered/cleaned before counting:
    - empty tokens, numbers and digit-leading tokens are skipped;
    - stop words (matched case-insensitively) are skipped;
    - a trailing ".<digits>" list index is dropped ("Chose.1" -> "Chose");
    - French quote characters that TextBlob leaves in are removed;
    - elided articles are dropped ("l'avion" -> "avion");
    - single-character leftovers are discarded.

    :param text: unicode text to analyse
    :param stop_words: set of lower-cased words to ignore
    :return: dict mapping word -> occurrence count
    """
    blob = TextBlob(text)
    stats = {}

    for word in blob.words:
        word = word.strip()
        # skip empty and numbers
        if not word \
           or word.lower() in stop_words \
           or word.isdigit() or word[0].isdigit():
            continue

        # Drop a trailing list index: "Chose.1" -> "Chose"
        # BUG FIX: raw string — '\w' / '\d' are invalid escape sequences
        # in a plain string literal (SyntaxWarning/error on modern Python)
        if re.search(r'\w+\.\d+$', word):
            word = word.split('.')[0]

        # these characters are not stripped by TextBlob
        word = word.replace(u"»", "")
        word = word.replace(u"«", "")

        # handle words starting by "l'" or "d'" : "l'avion" -> "avion"
        if "'" in word:
            word = word[word.find("'")+1:]

        # final filtering: the cleaning above may have produced a stop
        # word or a one-letter token, so filter again before counting
        if len(word) > 1 and word not in stop_words:
            stats[word] = stats.get(word, 0) + 1

    return stats
|
||||||
|
|
||||||
|
#############################################################################
|
||||||
|
def save_stats_csv(stats, path, encoding):
    """Write *stats* as "word,count" csv lines, most frequent first.

    BUG FIX: the original used dict.keys() + list.sort(cmp=..., cmp()),
    which only works on Python 2; sorted() with a key function is
    equivalent and portable.  The output handle is now closed, so the
    data is reliably flushed to disk.

    :param stats: dict mapping word -> occurrence count
    :param path: csv output filename
    :param encoding: encoding of the output file
    """
    # Sort words by descending occurrence count
    words = sorted(stats, key=lambda w: stats[w], reverse=True)

    f = codecs.open(path, "w", encoding)
    try:
        # Output results
        for word in words:
            f.write(u"%s,%d\n" % (word, stats[word]))
    finally:
        f.close()  # the original leaked the handle / never flushed
|
||||||
|
|
||||||
|
#############################################################################
|
||||||
|
if __name__ == '__main__':
    # Command-line entry point: parse options, then run the pipeline
    # (load text -> count word occurrences -> dump csv).
    opt_parser = optparse.OptionParser("usage: %prog [options] FILE")
    opt_parser.add_option("-s", "--stop-words", dest="stop_words",
                          type="string", default="",
                          help="path to stop word file")
    opt_parser.add_option("-o", "--output", dest="output",
                          type="string", default="wordocc.csv",
                          help="csv output filename")
    opt_parser.add_option("-e", "--encoding", dest="encoding",
                          type="string", default=DEFAULT_ENCODING,
                          help="file encoding (default: %s)" % DEFAULT_ENCODING)

    opts, positional = opt_parser.parse_args()
    if len(positional) != 1:
        opt_parser.error("incorrect number of arguments")

    text = load_text(positional[0], opts.encoding)

    # Stop words are optional; default to an empty set
    stop_words = set()
    if opts.stop_words:
        stop_words = load_stop_words(opts.stop_words, opts.encoding)

    save_stats_csv(process(text, stop_words), opts.output, opts.encoding)
|
Loading…
Reference in New Issue
Block a user