"""
Ryan Pepper (2018)
process.py
Module containing functions that process word frequency dicts
into a report format.
"""
import glob
import os
from FolderAnalyse import fileparser as fp
def top_frequencies(freq_dict, nterms=10):
    """
    top_frequencies(freq_dict, nterms=10)

    Returns a new dictionary holding the first nterms entries of
    freq_dict, taken in the dictionary's own iteration order.

    Input:
        freq_dict, dict:
            Dictionary of word frequencies.
        nterms, int:
            Number of word frequencies in returned dictionary.

    Output:
        dict:
            The reduced size dictionary. If freq_dict holds fewer than
            nterms entries, a copy of all of them is returned.

    Example:
        >>> top_frequencies({'the': 3, 'fox': 1, 'dog': 1}, nterms=2)
        {'the': 3, 'fox': 1}
    """
    pairs = list(freq_dict.items())
    # Clamp the cut-off so short dictionaries are returned whole.
    cutoff = len(pairs) if len(pairs) < nterms else nterms
    return dict(pairs[:cutoff])
def _dict_to_text(freq_dict):
"""
_dict_to_text(freq_dict):
Internal routine to print items and values
in a sorted dictionary, to avoid duplicating
this code in process_file and in process_dir.
Inputs:
freq_dict, dict
Dictionary of words and frequencies.
Outputs:
str:
Text block with numerated key-value pairs.
"""
stats_text = ""
for i, (key, value) in enumerate(freq_dict.items()):
stats_text += f"{i+1}. {key}, {value}\n"
stats_text += '\n'
return stats_text
def underline(title):
    """
    underline(title)

    Returns title followed by a line of dashes of the same length,
    as used for headings in restructured text.

    Inputs:
        title, str:
            Title to be underlined.

    Outputs:
        str:
            Two-line string (title, then dashes), each line terminated
            by a newline.

    Example:
        >>> underline('Hello')
        'Hello\\n-----\\n'
    """
    rule = '-' * len(title)
    # The empty last element makes join emit the final newline.
    return '\n'.join((title, rule, ''))
def process_file(filename, N=10, case_sensitive=False):
    """
    process_file(filename, N=10, case_sensitive=False)

    Process a single file and build a short text report listing its
    top N words, returning the report together with the frequency
    dictionaries it was built from.

    Inputs:
        filename, str:
            File to be processed.
        N, int:
            Number of top frequencies to add to report.
        case_sensitive, bool:
            Whether processing should be case sensitive or not, i.e.
            if 'the' is the same as 'The' for counting word frequencies.
            Passed straight through to fileparser.parse.

    Outputs:
        str:
            Textual report about word frequency in the file.
        dict:
            Total word frequency dict, as returned by
            fileparser.parse(..., sort=True).
        dict:
            Reduced word frequency dict with at most N terms.

    Example:
        >>> f = open('test.txt', 'w')
        >>> f.write("The quick brown fox jumped over the lazy dog.")
        >>> f.close()
        >>> text, freq_dict, top = FolderAnalyse.process.process_file("test.txt")
        >>> print(freq_dict['the'])
        2
    """
    heading = underline(f"File \"{filename}\" Top {N} Word Frequencies")
    frequency_dict = fp.parse(
        filename,
        case_sensitive=case_sensitive,
        sort=True,
    )
    top_freq = top_frequencies(frequency_dict, nterms=N)
    report = heading + _dict_to_text(top_freq)
    return report, frequency_dict, top_freq
def process_dir(dirname, extension="txt", N=10, case_sensitive=False):
    """
    process_dir(dirname, extension="txt", N=10, case_sensitive=False)

    Processes every file in the given directory whose name matches
    "*.<extension>" by calling process_file on each of them, in sorted
    path order. It then returns a report along with the data used to
    construct it.

    Inputs:
        dirname, str:
            Directory to be processed.
        extension, str:
            File extension to process in the directory (without the
            leading dot).
        N, int:
            How many top frequencies should be calculated.
        case_sensitive, bool:
            Whether processing should be case sensitive or not, i.e.
            if 'the' is the same as 'The' for counting word frequencies.

    Outputs:
        str:
            Text report detailing the word frequencies for displaying.
        list of dicts:
            The full word frequency dicts for each file.
        list of dicts:
            The reduced top frequency dicts with N entries.
        dict:
            The combined frequency dict across all files, built with
            fileparser.combine_dicts and fileparser.sort_dict.
        dict:
            The top N word frequencies across all files.

    Raises:
        FileNotFoundError:
            If no file in dirname matches the extension.

    Example:
        >>> f1 = open('test1.txt', 'w')
        >>> f1.write("The quick brown fox jumped over the lazy dog.")
        >>> f1.close()
        >>> f2 = open('test2.txt', 'w')
        >>> f2.write("This is a second file, the most common word will "
        ...          "still be the word the")
        >>> f2.close()
        >>> (text, freq_dicts, top_dicts,
        ...  combined, top_combined) = FolderAnalyse.process.process_dir(".")
        >>> print(combined['the'])
        5
    """
    # glob makes no promise about ordering; sorting keeps the report and
    # the returned per-file lists reproducible across runs and platforms.
    files = sorted(glob.glob(os.path.join(dirname, f'*.{extension}')))
    if not files:
        raise FileNotFoundError("No Files!")

    report_parts = [
        underline(f"Directory \"{dirname}\" Top {N} Word Frequencies")
    ]
    dicts = []
    top_dicts = []
    for path in files:
        stat, dic, top_dict = process_file(path, N, case_sensitive)
        report_parts.append(stat)
        dicts.append(dic)
        top_dicts.append(top_dict)

    combined_dict = fp.sort_dict(fp.combine_dicts(dicts))
    top_combined = top_frequencies(combined_dict, N)

    report_parts.append(
        underline(f"All Files in {dirname} Top {N} Word Frequencies")
    )
    report_parts.append(_dict_to_text(top_combined))
    stats_text = "".join(report_parts)
    return stats_text, dicts, top_dicts, combined_dict, top_combined