Initial add

commit 44b9428685

README.md (51 lines, Normal file)

@@ -0,0 +1,51 @@
# WordOcc

A word frequency tool that outputs sorted results in csv format, with stop word support.

# Requirements

 - Python 2.6
 - TextBlob https://textblob.readthedocs.org/en/dev/ (see the install sketch below)
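
TextBlob is available from PyPI; a typical install (assuming pip is available and that the corpora download command is still the one documented by TextBlob) looks like:

    pip install textblob
    python -m textblob.download_corpora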

# Usage

## Basic

    python wordocc.py a_interesting_text.txt

Writes output like the following to wordocc.csv:

    top,43
    image,31
    sample,29
    ...
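
The output is plain "word,count" CSV in the chosen encoding, so it is easy to post-process. A minimal sketch for reading it back in Python (the file name is just the default shown above):

    import codecs

    # each line written by wordocc.py is "word,count"
    for line in codecs.open("wordocc.csv", "r", "utf-8"):
        word, count = line.strip().rsplit(",", 1)
        count = int(count)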

## Options

    wordocc.py -h
    Usage: wordocc.py [options] FILE

    Options:
      -h, --help            show this help message and exit
      -s STOP_WORDS, --stop-words=STOP_WORDS
                            path to stop word file
      -o OUTPUT, --output=OUTPUT
                            csv output filename
      -e ENCODING, --encoding=ENCODING
                            file encoding (default: utf-8)
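
All options can be combined, for example (paths and file names here are only illustrative):

    python wordocc.py -s stop.txt -o report.csv -e latin-1 a_interesting_text.txt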

## Stop words

### Introduction

Stop words are words that are not interesting for the statistical study, such as articles, conjunctions, etc.

You have to provide a file containing those words (one per line); a short sample is shown below. The following files can help:

 - English: http://snowball.tartarus.org/algorithms/english/stop.txt
 - French: http://snowball.tartarus.org/algorithms/french/stop.txt
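
A stop word file simply lists one word per line; anything after a '|' on a line is ignored as a comment (the same convention as the Snowball lists above), and words are lowercased when loaded. For example:

    the
    a      | article
    and    | conjunction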

### Usage

    python wordocc.py -s /home/jdoe/en/stop.txt a_interesting_text.txt

wordocc.py (125 lines, Executable file)

@@ -0,0 +1,125 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
WordOcc

Output csv with word,frequency

Stop word support. You may try:
    http://snowball.tartarus.org/algorithms/english/stop.txt
    http://snowball.tartarus.org/algorithms/french/stop.txt
"""
import os, sys, codecs, re, optparse, logging

try:
    from textblob import TextBlob
except ImportError:
    sys.stderr.write("""textblob library not found.
See https://textblob.readthedocs.org/en/dev/
""")
    sys.exit(1)

# settings
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# path to the stop word file
STOP_WORDS_PATH = os.path.join(BASE_DIR, 'stop_words.txt')

DEFAULT_ENCODING = "utf-8"

#############################################################################
def load_stop_words(path, encoding):
    """Load the stop word list, handling comments with '|'."""
    stop_words = set()
    for line in codecs.open(path, "r", encoding):
        line = line.strip()
        if '|' in line:
            line = line.split('|')[0].strip()
        if line:
            stop_words.add(line.lower())
    return stop_words

#############################################################################
def load_text(path, encoding):
    """Load the text, joining words hyphenated across line breaks."""
    f = codecs.open(path, "r", encoding)
    lines = []
    for line in f:
        if lines and lines[-1].endswith('-\n'):
            lines[-1] = lines[-1].replace('-\n', '').strip() + line
        else:
            lines.append(line)
    f.close()
    return u"".join(lines).replace("\n\n", ". \n")
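
# Illustrative example: for a file containing "some hy-\nphenated text\n",
# load_text() returns u"some hyphenated text\n"; a blank line in the input
# ("\n\n") is turned into ". \n" so paragraph breaks become sentence
# boundaries before TextBlob parses the text.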

#############################################################################
def process(text, stop_words):
    """Use TextBlob for word parsing and add some extra processing."""
    blob = TextBlob(text)
    stats = {}

    for word in blob.words:
        word = word.strip()
        # skip empty words, stop words and numbers
        if not word \
            or word.lower() in stop_words \
            or word.isdigit() or word[0].isdigit():
            continue

        # strip a trailing numeric suffix (e.g. footnote marker): "Chose.1" -> "Chose"
        if re.search(r'\w+\.\d+$', word):
            word = word.split('.')[0]

        # these characters are not stripped by TextBlob
        word = word.replace(u"»", "")
        word = word.replace(u"«", "")

        # handle words starting with "l'" or "d'": "l'avion" -> "avion"
        if "'" in word:
            word = word[word.find("'")+1:]

        # final filtering
        if len(word) > 1 and word not in stop_words:
            if word not in stats:
                stats[word] = 0
            stats[word] += 1
    return stats
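
# Illustrative example: process(u"The cat saw the cat", set([u"the"]))
# yields counts equivalent to {u'cat': 2, u'saw': 1} ("the" is filtered as
# a stop word, case-insensitively).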

#############################################################################
def save_stats_csv(stats, path, encoding):
    """Write csv stats"""
    # Sort words by descending count
    words = stats.keys()
    words.sort(key=lambda w: stats[w], reverse=True)
    f = codecs.open(path, "w", encoding)
    # Output results
    for word in words:
        f.write(u"%s,%d\n" % (word, stats[word]))
    f.close()

#############################################################################
if __name__ == '__main__':
    parser = optparse.OptionParser("usage: %prog [options] FILE")
    parser.add_option("-s", "--stop-words", dest="stop_words",
                      default="", type="string",
                      help="path to stop word file")

    parser.add_option("-o", "--output", dest="output", default="wordocc.csv",
                      type="string", help="csv output filename")

    parser.add_option("-e", "--encoding", dest="encoding", default=DEFAULT_ENCODING,
                      type="string", help="file encoding (default: %s)" % DEFAULT_ENCODING)

    (options, args) = parser.parse_args()
    if len(args) != 1:
        parser.error("incorrect number of arguments")

    text = load_text(args[0], options.encoding)

    if options.stop_words:
        stop_words = load_stop_words(options.stop_words, options.encoding)
    else:
        stop_words = set()

    stats = process(text, stop_words)

    save_stats_csv(stats, options.output, options.encoding)
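
Since the heavy lifting lives in load_text, load_stop_words, process and save_stats_csv, the script can also be driven from another Python program. A minimal sketch (file names are placeholders; it assumes wordocc.py is importable, e.g. from the current directory):

    import wordocc

    text = wordocc.load_text("a_interesting_text.txt", "utf-8")
    stop_words = wordocc.load_stop_words("stop.txt", "utf-8")
    stats = wordocc.process(text, stop_words)
    wordocc.save_stats_csv(stats, "wordocc.csv", "utf-8")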