Initial add
This commit is contained in:
commit
44b9428685
51
README.md
Normal file
51
README.md
Normal file
@ -0,0 +1,51 @@
|
||||
# WordOcc
|
||||
|
||||
A word frequency tool that outputs sorted results in csv format, supporting stop words.
|
||||
|
||||
# Requirements
|
||||
|
||||
- Python 2.6
|
||||
- TextBlob https://textblob.readthedocs.org/en/dev/
|
||||
|
||||
# Usage
|
||||
|
||||
## Basic
|
||||
|
||||
python wordocc.py a_interesting_text.txt
|
||||
|
||||
Outputs such content in wordocc.csv :
|
||||
|
||||
top,43
|
||||
image,31
|
||||
sample,29
|
||||
...
|
||||
|
||||
## Options
|
||||
|
||||
wordocc.py -h
|
||||
Usage: wordocc.py [options] FILE
|
||||
|
||||
Options:
|
||||
-h, --help show this help message and exit
|
||||
-s STOP_WORDS, --stop-words=STOP_WORDS
|
||||
path to stop word file
|
||||
-o OUTPUT, --output=OUTPUT
|
||||
csv output filename
|
||||
-e ENCODING, --encoding=ENCODING
|
||||
file encoding (default: utf-8)
|
||||
|
||||
|
||||
## Stop words
|
||||
|
||||
### Introduction
|
||||
|
||||
Stop words are words that are not interesting for the statistical study, like articles, conjunctions, etc.
|
||||
|
||||
You have to provide a file containing those words (one per line). Following files can help :
|
||||
|
||||
- English : http://snowball.tartarus.org/algorithms/english/stop.txt
|
||||
- French : http://snowball.tartarus.org/algorithms/french/stop.txt
|
||||
|
||||
### Usage
|
||||
|
||||
python wordocc.py -s /home/jdoe/en/stop.txt a_interesting_text.txt
|
125
wordocc.py
Executable file
125
wordocc.py
Executable file
@ -0,0 +1,125 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
WordOcc
|
||||
|
||||
Output csv with word,frequency
|
||||
|
||||
Stop Words support. You may try :
|
||||
http://snowball.tartarus.org/algorithms/english/stop.txt
|
||||
http://snowball.tartarus.org/algorithms/french/stop.txt
|
||||
"""
|
||||
import os, sys, codecs, re, optparse, logging
|
||||
|
||||
try:
    from textblob import TextBlob
except ImportError:
    # BUG FIX: sys.stderr is a file object, not a callable —
    # the original `sys.stderr(...)` raised TypeError instead of
    # printing the intended message. Use .write().
    sys.stderr.write("""textblob library not found.
See https://textblob.readthedocs.org/en/dev/""")
    sys.exit(1)
|
||||
|
||||
# settings
# Absolute directory containing this script, used as anchor for data files.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# path stop word file
# NOTE(review): default stop-word location; the visible CLI code only uses
# the -s/--stop-words option, so this constant looks unused — confirm.
STOP_WORDS_PATH = os.path.join(BASE_DIR, 'stop_words.txt')

# Encoding used when -e/--encoding is not given on the command line.
DEFAULT_ENCODING = "utf-8"
|
||||
|
||||
#############################################################################
|
||||
def load_stop_words(path, encoding):
    """Load a stop-word file into a set.

    One word per line; anything after a '|' is a comment (snowball
    stop-word file format). Words are lower-cased.

    path     -- path to the stop-word file
    encoding -- text encoding of the file
    Returns a set of lower-cased stop words.
    """
    stop_words = set()
    f = codecs.open(path, "r", encoding)
    try:
        for line in f:
            line = line.strip()
            # drop snowball-style inline comments: "word | explanation"
            if '|' in line:
                line = line.split('|')[0].strip()
            if line:
                stop_words.add(line.lower())
    finally:
        # original code never closed the handle
        f.close()
    return stop_words
|
||||
|
||||
#############################################################################
|
||||
def load_text(path, encoding):
    """Read a text file, rejoining hyphen-broken words across lines.

    A line ending in '-\\n' is assumed to be a word hyphenated at a
    line break and is merged with the following line. Blank-line
    paragraph breaks ("\\n\\n") are turned into ". \\n" so the parser
    sees a sentence boundary.

    path     -- path to the text file
    encoding -- text encoding of the file
    Returns the whole document as one unicode string.
    """
    f = codecs.open(path, "r", encoding)
    lines = []
    try:
        for line in f:
            if lines and lines[-1].endswith('-\n'):
                # glue the hyphenated fragment onto the next line
                lines[-1] = lines[-1].replace('-\n', '').strip() + line
            else:
                lines.append(line)
    finally:
        # original closed the file only on the happy path
        f.close()
    return u"".join(lines).replace("\n\n", ". \n")
|
||||
|
||||
#############################################################################
|
||||
def process(text, stop_words):
    """Tokenize *text* with TextBlob and count word occurrences.

    text       -- unicode text to analyse
    stop_words -- set of lower-cased words to ignore
    Returns a dict mapping word -> occurrence count.
    """
    blob = TextBlob(text)
    stats = {}
    # Numbered-item pattern, e.g. "Chose.1". Raw string: the original
    # non-raw '\w'/'\d' escapes raise invalid-escape warnings on
    # modern Pythons. Compiled once, outside the loop.
    numbered_item = re.compile(r'\w+\.\d+$')

    for word in blob.words:
        word = word.strip()
        # skip empty tokens, stop words and numbers
        if not word \
           or word.lower() in stop_words \
           or word.isdigit() or word[0].isdigit():
            continue

        # Extract word from a numbered item: "Chose.1" -> "Chose"
        # (the original comment said "1.Chose", but such tokens are
        # already skipped above by the leading-digit guard)
        if numbered_item.search(word):
            word = word.split('.')[0]

        # those characters are not stripped by TextBlob
        word = word.replace(u"»", "")
        word = word.replace(u"«", "")

        # handle words starting with "l'" or "d'": "l'avion" -> "avion"
        if "'" in word:
            word = word[word.find("'")+1:]

        # final filtering: ignore single letters and (post-cleanup) stop words
        if len(word) > 1 and word not in stop_words:
            stats[word] = stats.get(word, 0) + 1
    return stats
|
||||
|
||||
#############################################################################
|
||||
def save_stats_csv(stats, path, encoding):
    """Write stats as "word,count" csv lines, most frequent first.

    stats    -- dict mapping word -> occurrence count
    path     -- output csv filename
    encoding -- text encoding for the output file
    """
    # Sort words by top count. sorted() with a key function works on
    # both Python 2 and 3; the original list.sort(cmp) / cmp() idiom
    # is Python-2-only.
    words = sorted(stats, key=stats.get, reverse=True)
    f = codecs.open(path, "w", encoding)
    try:
        # Output results
        for word in words:
            f.write(u"%s,%d\n" % (word, stats[word]))
    finally:
        # original never closed (or flushed) the output file
        f.close()
|
||||
|
||||
#############################################################################
|
||||
if __name__ == '__main__':
    # Command-line entry point: wordocc.py [options] FILE
    opt_parser = optparse.OptionParser("usage: %prog [options] FILE")
    opt_parser.add_option("-s", "--stop-words", dest="stop_words",
                          type="string", default="",
                          help="path to stop word file")
    opt_parser.add_option("-o", "--output", dest="output",
                          type="string", default="wordocc.csv",
                          help="csv output filename")
    opt_parser.add_option("-e", "--encoding", dest="encoding",
                          type="string", default=DEFAULT_ENCODING,
                          help="file encoding (default: %s)" % DEFAULT_ENCODING)

    options, args = opt_parser.parse_args()
    # exactly one positional argument: the input text file
    if len(args) != 1:
        opt_parser.error("incorrect number of arguments")

    text = load_text(args[0], options.encoding)

    # stop words are optional; default to an empty set
    stop_words = set()
    if options.stop_words:
        stop_words = load_stop_words(options.stop_words, options.encoding)

    stats = process(text, stop_words)
    save_stats_csv(stats, options.output, options.encoding)
|
Loading…
Reference in New Issue
Block a user