#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
WordOcc

Output csv with word,frequency
Stop Words support.

You may try :
http://snowball.tartarus.org/algorithms/english/stop.txt
http://snowball.tartarus.org/algorithms/french/stop.txt
"""
import os
import sys
import codecs
import re
import optparse
import logging

try:
    from textblob import TextBlob
except ImportError:
    # BUG FIX: sys.stderr is a file object, not a callable -- use .write()
    sys.stderr.write("""textblob library not found.
See https://textblob.readthedocs.org/en/dev/""")
    sys.exit(1)

# settings
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# path stop word file
STOP_WORDS_PATH = os.path.join(BASE_DIR, 'stop_words.txt')
DEFAULT_ENCODING = "utf-8"

# Numbered list item, e.g. "Chose.1" (word followed by '.' and digits at the
# end).  Compiled once instead of re.search()-ing a non-raw pattern per word.
LIST_ITEM_RE = re.compile(r'\w+\.\d+$')

#############################################################################


def load_stop_words(path, encoding):
    """Load a stop-word list: one word per line, '|' introduces a comment
    (snowball stop-list format).

    Returns a set of lower-cased words.
    """
    stop_words = set()
    # 'with' guarantees the file is closed even on a decode error
    with codecs.open(path, "r", encoding) as f:
        for line in f:
            line = line.strip()
            if '|' in line:
                # keep only the part before the inline comment
                line = line.split('|')[0].strip()
            if line:
                stop_words.add(line.lower())
    return stop_words

#############################################################################


def load_text(path, encoding):
    """Load a text file, handling trailing caret / word cesure.

    A line ending with "-\\n" is a word cut by hyphenation: it is glued to
    the following line.  Paragraph breaks ("\\n\\n") are then turned into
    sentence breaks so the tokenizer sees paragraph ends as sentence ends.
    """
    lines = []
    with codecs.open(path, "r", encoding) as f:
        for line in f:
            if lines and lines[-1].endswith('-\n'):
                # re-join the hyphenated word with the start of this line
                lines[-1] = lines[-1].replace('-\n', '').strip() + line
            else:
                lines.append(line)
    return u"".join(lines).replace("\n\n", ". \n")

#############################################################################


def process(text, stop_words):
    """Tokenize *text* with TextBlob and count word occurrences.

    Skips stop words, pure numbers and tokens starting with a digit,
    strips guillemets and French elision prefixes, and drops words of a
    single character.  Returns a dict mapping word -> occurrence count.
    """
    blob = TextBlob(text)
    stats = {}
    for word in blob.words:
        word = word.strip()
        # skip empty tokens, stop words and numeric-looking tokens
        if (not word
                or word.lower() in stop_words
                or word.isdigit()
                or word[0].isdigit()):
            continue
        # extract word from a numbered list item: "Chose.1" -> "Chose"
        if LIST_ITEM_RE.search(word):
            word = word.split('.')[0]
        # guillemets are not stripped by TextBlob
        word = word.replace(u"»", "").replace(u"«", "")
        # handle elision prefixes "l'" / "d'": "l'avion" -> "avion"
        if "'" in word:
            word = word[word.find("'") + 1:]
        # final filtering
        if len(word) > 1 and word not in stop_words:
            stats[word] = stats.get(word, 0) + 1
    return stats

#############################################################################


def save_stats_csv(stats, path, encoding):
    """Write stats to *path* as "word,count" lines, most frequent first."""
    with codecs.open(path, "w", encoding) as f:
        # sort words by top count; Python-3 compatible (no cmp())
        for word in sorted(stats, key=stats.get, reverse=True):
            f.write(u"%s,%d\n" % (word, stats[word]))

#############################################################################


if __name__ == '__main__':
    parser = optparse.OptionParser("usage: %prog [options] FILE")
    parser.add_option("-s", "--stop-words", dest="stop_words", default="",
                      type="string", help="path to stop word file")
    parser.add_option("-o", "--output", dest="output", default="wordocc.csv",
                      type="string", help="csv output filename")
    parser.add_option("-e", "--encoding", dest="encoding",
                      default=DEFAULT_ENCODING, type="string",
                      help="file encoding (default: %s)" % DEFAULT_ENCODING)
    (options, args) = parser.parse_args()
    if len(args) != 1:
        parser.error("incorrect number of arguments")

    text = load_text(args[0], options.encoding)
    if options.stop_words:
        stop_words = load_stop_words(options.stop_words, options.encoding)
    else:
        stop_words = set()
    stats = process(text, stop_words)
    save_stats_csv(stats, options.output, options.encoding)