# Source code for FolderAnalyse.process

"""
Ryan Pepper (2018)

process.py

Module containing functions that process word frequency dicts
into a report format.

"""
import glob
import os
from FolderAnalyse import fileparser as fp


def top_frequencies(freq_dict, nterms=10):
    """
    top_frequencies(freq_dict, nterms=10)

    Returns the first nterms entries of the dictionary, taken in the
    dictionary's own iteration order.

    Input:
        freq_dict, dict:
            Dictionary of word frequencies. It is not re-sorted here, so
            it must already be in the order the caller cares about
            (e.g. most frequent word first).

        nterms, int:
            Maximum number of word frequencies in the returned dictionary.

    Output:
        dict:
            A new, reduced size dictionary with at most nterms entries.

    Note:
    Dictionaries with fewer than nterms entries are returned in full
    (as a copy). No explicit length check is required for this, because
    slicing past the end of a list simply stops at the last element.

    Example:

    >>> top_frequencies({'the': 3, 'fox': 1, 'dog': 1}, nterms=2)
    {'the': 3, 'fox': 1}
    """
    return dict(list(freq_dict.items())[:nterms])


def _dict_to_text(freq_dict):
    """
    _dict_to_text(freq_dict):

    Internal routine to print items and values
    in a sorted dictionary, to avoid duplicating
    this code in process_file and in process_dir.

    Inputs:
        freq_dict, dict
            Dictionary of words and frequencies.

    Outputs:
        str:
            Text block with numerated key-value pairs.
    """
    stats_text = ""
    for i, (key, value) in enumerate(freq_dict.items()):
        stats_text += f"{i+1}. {key}, {value}\n"
    stats_text += '\n'
    return stats_text


def underline(title):
    """
    underline(title)

    Returns title followed by a line of dashes of matching length,
    as used for section headings in restructured text format.

    Inputs:
        title, str:
            Title to be underlined.

    Outputs:
        str:
            Multiline string: the title, a newline, one '-' per
            character of the title, and a final newline.

    Example:

    >>> print(FolderAnalyse.process.underline('Hello'), end='')
    Hello
    -----
    """
    return f"{title}\n{'-' * len(title)}\n"


def process_file(filename, N=10, case_sensitive=False):
    """
    process_file(filename, N=10, case_sensitive=False)

    Process a file and return some text giving the top
    N words in the file, the original frequency dictionary
    and the reduced dictionary the text was built from.

    Inputs:
        filename, str:
            File to be processed.
        N, int:
            Number of top frequencies to add to report
        case_sensitive, bool:
            Whether processing should be case sensitive or not, i.e.
            if 'the' is the same as 'The' for counting word frequencies.


    Outputs:
        str:
            Textual report about word frequency in files.
        dict:
            Total word frequency dict
        dict:
            Reduced word frequency dict with at most N terms.

    Example:
    >>> with open('test.txt', 'w') as f:
    ...     _ = f.write("The quick brown fox jumped over the lazy dog.")
    >>> text, freq_dict, top_freq = FolderAnalyse.process.process_file(
    ...     "test.txt")
    >>> print(freq_dict['the'])
    2
    """
    stats_text = underline(f"File \"{filename}\" Top {N} Word Frequencies")

    # The parser is asked for a sorted dict (sort=True) so that the first
    # N entries taken below are the "top" ones.
    frequency_dict = fp.parse(filename, case_sensitive=case_sensitive,
                              sort=True)

    top_freq = top_frequencies(frequency_dict, nterms=N)
    stats_text += _dict_to_text(top_freq)

    return stats_text, frequency_dict, top_freq


def process_dir(dirname, extension="txt", N=10, case_sensitive=False):
    """
    process_dir(dirname, extension="txt", N=10, case_sensitive=False)

    Processes all files in the given directory that have the given
    extension, calling process_file on each of them. It then returns a
    report along with the data used to construct this.

    Inputs:
        dirname, str:
            Directory to be processed
        extension, str:
            File extension to process in the directory, given without
            the leading dot (the pattern searched for is "*.<extension>").
        N, int:
            How many top frequencies should be calculated.
        case_sensitive, bool:
            Whether processing should be case sensitive or not, i.e.
            if 'the' is the same as 'The' for counting word frequencies.

    Outputs:
        str:
            Text report detailing the word frequencies for displaying.
        list of dicts:
            The full word frequency dicts for each file, ordered by
            file path.
        list of dicts:
            The reduced top frequency dicts with at most N entries, in
            the same order.
        dict:
            The combined frequency dict across all files.
        dict:
            The top N word frequencies across all files.

    Raises:
        FileNotFoundError:
            If no file in dirname matches the extension.

    Example:

    >>> with open('test1.txt', 'w') as f1:
    ...     _ = f1.write("The quick brown fox jumped over the lazy dog.")
    >>> with open('test2.txt', 'w') as f2:
    ...     _ = f2.write("This is a second file, the most common word will "
    ...                  "still be the word the")

    >>> text, freq_dicts, top_dicts, combined, top_combined = \\
    ...     FolderAnalyse.process.process_dir(".")
    >>> print(combined['the'])
    5
    """
    # glob returns matches in arbitrary, platform dependent order; sorting
    # makes the report and the returned lists reproducible between runs.
    files = sorted(glob.glob(os.path.join(dirname, f'*.{extension}')))
    if not files:
        raise FileNotFoundError("No Files!")

    # Collect the report in pieces and join once at the end.
    report_parts = [
        underline(f"Directory \"{dirname}\" Top {N} Word Frequencies")
    ]

    dicts = []
    top_dicts = []
    for path in files:
        stat, dic, top_dict = process_file(path, N, case_sensitive)
        report_parts.append(stat)
        dicts.append(dic)
        top_dicts.append(top_dict)

    # Merge the per-file counts, then re-sort so that the leading entries
    # of the combined dict are the ones reported as "top".
    combined_dict = fp.sort_dict(fp.combine_dicts(dicts))
    top_combined = top_frequencies(combined_dict, N)
    report_parts.append(
        underline(f"All Files in {dirname} Top {N} Word Frequencies"))
    report_parts.append(_dict_to_text(top_combined))

    stats_text = "".join(report_parts)
    return stats_text, dicts, top_dicts, combined_dict, top_combined