# Source code for summarease.summarize

import pandas as pd
from fpdf import FPDF
from pathlib import Path
from summarease.summarize_numeric import summarize_numeric
from summarease.summarize_target import summarize_target_df, summarize_target_balance_plot
from summarease.summarize_dtypes import summarize_dtypes_table
from PIL import Image



# [docs]
def validate_or_create_path(path):
    """
    Check that `path` is a `Path` object and make sure a directory exists at it.

    When `path` already points at an existing regular file, nothing is created.
    In every other case `path` itself is created as a directory (together with
    any missing parents); an already existing directory is left untouched.

    Parameters
    ----------
    path : Path
        The location to validate. Typically a directory that should exist.

    Raises
    ------
    TypeError
        If the provided `path` is not an instance of `Path`.

    Notes
    -----
    - A path that does not exist yet is always treated as a directory, even
      when its name carries a file extension.
    """
    if not isinstance(path, Path):
        raise TypeError(f"Expected a Path object, got {type(path)}.")

    # An existing file needs no directory of its own; its parent is already there.
    points_to_existing_file = path.is_file()
    if points_to_existing_file:
        return None

    path.mkdir(parents=True, exist_ok=True)
    return None

            



# [docs]
def add_image(pdf, image_path, pdf_height, pdf_width, element_padding=15):
    """
    Adds an image to a PDF document below the current cursor position, starting a new
    page first when the estimated image height does not fit on the current one.

    Args:
        pdf: A FPDF object representing the PDF document to which the image will be added.
        image_path (str or Path): The file path to the image to be added. The extension must be one of
                                  .jpg, .jpeg, .png, .gif, .bmp, .tiff, .webp (compared case-insensitively).
        pdf_height (int or float): The total height of the PDF page in units consistent with the FPDF settings.
        pdf_width (int or float): The total width of the PDF page in units consistent with the FPDF settings.
        element_padding (int, optional): The vertical gap (in units consistent with FPDF) inserted between
                                          the cursor position and the top of the image. Default is 15.

    Returns:
        pdf: The same FPDF object, with the image added and the cursor moved below it.

    Raises:
        AssertionError: If an argument has the wrong type or the file extension is not supported.
        ValueError: If `image_path` does not point to an existing file.

    Notes:
        - The image height is estimated from its pixel height assuming 96 DPI and converted to millimeters.
        - The usable page height is taken as `pdf_height - 2 * pdf.t_margin`.
        - Only a width is passed to `pdf.image`: the printable width (`pdf_width - 2 * pdf.l_margin`),
          shrunk by `usable height / estimated height` when the estimate exceeds the usable page
          height, and truncated to an integer. The rendered height is left to FPDF.
        - The cursor is advanced by the unscaled height estimate plus `element_padding` afterwards.
    """
    assert isinstance(pdf, FPDF), f"Argument 'pdf' should be FPDF class. You have {type(pdf)}."
    assert isinstance(image_path, (Path, str)), f"Argument 'image_path' should be a Path class or string. You have {type(image_path)}."
    assert isinstance(pdf_height, (int, float)), f"Argument 'pdf_height' should be an integer or float. You have {type(pdf_height)}."
    # The original check tested `pdf_height` here, so a wrong `pdf_width` could slip through.
    assert isinstance(pdf_width, (int, float)), f"Argument 'pdf_width' should be an integer or float. You have {type(pdf_width)}."
    assert isinstance(element_padding, int), f"Argument 'element_padding' should be an integer. You have {type(element_padding)}."

    image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'}
    image_path = Path(image_path)
    # Compare case-insensitively so e.g. "plot.PNG" is accepted as well.
    assert image_path.suffix.lower() in image_extensions, f"Unsupported image format. Should be {image_extensions}"
    image_path_str = str(image_path)

    if not image_path.is_file():
        raise ValueError(f"File not found: {image_path_str}")

    page_height = pdf_height - 2 * pdf.t_margin

    # Only the pixel height is needed; the file is closed again right away.
    with Image.open(image_path_str) as img:
        _, image_height = img.size

    dpi = 96
    element_height_mm = image_height / dpi * 25.4

    # Shrink the image when its estimated height is taller than the usable page height.
    if element_height_mm > page_height:
        scale_factor = page_height / element_height_mm
    else:
        scale_factor = 1

    # Start a fresh page when the image would run past the usable height.
    if pdf.get_y() + element_height_mm > page_height:
        pdf.add_page()

    # NOTE(review): this advances the cursor by its own current y offset; kept as-is
    # to preserve the existing layout -- confirm this spacing is intended.
    pdf.ln(pdf.get_y())
    y_position = pdf.get_y()

    # Add the image to the PDF
    image_width = int(scale_factor * (pdf_width - 2 * pdf.l_margin))
    pdf.image(image_path_str, x=pdf.l_margin, y=y_position + element_padding, w=image_width)

    # Move the cursor below the image so following elements do not overlap it.
    pdf.ln(element_height_mm + element_padding)

    return pdf



# [docs]
def add_table(pdf, table, pdf_height, pdf_width, element_padding=15):
    """
    Adds a table to the PDF document with the provided data, scaling the column widths to fit
    within the page width while maintaining their relative proportions. The first row (header)
    and the first column (index) are drawn with a gray background.

    Args:
        pdf: A FPDF object representing the PDF document to which the table will be added.
        table (pandas.DataFrame): The non-empty table to render. Its index is rendered as a leading
                                  'Index' column; the caller's DataFrame itself is not modified.
        pdf_height (int or float): The total height of the PDF page in units consistent with the FPDF
                                   settings. Only type-checked here.
        pdf_width (int or float): The total width of the PDF page in units consistent with the FPDF settings.
        element_padding (int, optional): Horizontal space (in units consistent with FPDF) subtracted on
                                          each side of the page width when scaling columns. Default is 15.

    Returns:
        pdf: The same FPDF object with the table added.

    Raises:
        AssertionError: If an argument has the wrong type or `table` is empty.

    Notes:
        - Each column starts with a width of twice the length of its longest entry or of its name,
          whichever is longer; the index column gets at least 20. All widths are then scaled by a
          common factor so that they sum to `pdf_width - 2 * element_padding`.
        - A column name longer than half its cell width is cut to that length, suffixed with '...'
          and written in 8pt instead of 9pt.
        - Python `int`/`float` values (and subclasses) are rounded to 2 decimal places.
        - Column labels are converted with `str()`, so non-string labels are supported.
    """
    assert isinstance(pdf, FPDF), f"Argument 'pdf' should be FPDF class. You have {type(pdf)}."
    assert isinstance(table, pd.DataFrame), f"Argument 'table' should be a pandas Dataframe. You have {type(table)}."
    assert isinstance(pdf_height, (int, float)), f"Argument 'pdf_height' should be an integer or float. You have {type(pdf_height)}."
    # The original check tested `pdf_height` here, so a wrong `pdf_width` could slip through.
    assert isinstance(pdf_width, (int, float)), f"Argument 'pdf_width' should be an integer or float. You have {type(pdf_width)}."
    assert isinstance(element_padding, int), f"Argument 'element_padding' should be an integer. You have {type(element_padding)}."
    assert not table.empty, f"The table shouldn't be empty"

    pdf.set_font('Arial', '', 9)

    # Work on a copy: inserting into the caller's frame would mutate it and make a
    # second call with the same DataFrame fail because 'Index' already exists.
    table = table.copy()

    # Insert index as a new column (at the start)
    table.insert(0, 'Index', table.index)

    # Calculate maximum column width based on the longest entry or the column name
    col_widths = []
    for col in table.columns:
        longest_entry = table[col].apply(lambda x: len(str(x))).max()
        col_widths.append(max(longest_entry, len(str(col))) * 2)

    # Give the index column a minimum width
    col_widths[0] = max(col_widths[0], 20)

    # Scale column widths to fit within the page width, maintaining the relative proportions
    scale_factor = (pdf_width - 2 * element_padding) / sum(col_widths)
    col_widths = [w * scale_factor for w in col_widths]

    # Gray fill used by the header row and the index column; it never changes afterwards
    pdf.set_fill_color(230, 230, 230)

    # Add table header with gray background
    for width, col in zip(col_widths, table.columns):
        col_name = str(col)
        # Number of characters assumed to fit into the cell
        max_length_for_col = int(width // 2)
        if len(col_name) > max_length_for_col:
            col_name = col_name[:max_length_for_col] + '...'
            pdf.set_font('Arial', '', 8)
        pdf.cell(width, 10, col_name, border=1, align='C', fill=True)
        # Restore the regular font size for the next cell
        pdf.set_font('Arial', '', 9)
    pdf.ln()

    # Look every column up once instead of once per cell
    columns = [table[col] for col in table.columns]

    # Add table rows; only the first column (index) is filled
    for row in range(len(table)):
        for j, (width, column) in enumerate(zip(col_widths, columns)):
            value = column.iloc[row]
            # Round numeric values to 2 decimals
            if isinstance(value, (int, float)):
                value = round(value, 2)
            pdf.cell(width, 10, str(value), border=1, align='C', fill=(j == 0))
        pdf.ln()

    return pdf



# [docs]
def switch_page_if_needed(pdf):
    """
    Start a new page when the cursor has moved more than 50 units down the current page.

    Args:
        pdf: A FPDF object whose current y-position is inspected.

    Returns:
        pdf: The same FPDF object, with a page added when the threshold was exceeded.
    """
    assert isinstance(pdf, FPDF), f"Argument 'pdf' should be FPDF class. You have {pdf}"
    cursor_is_too_low = pdf.get_y() > 50
    if cursor_is_too_low:
        pdf.add_page()
    return pdf



# [docs]
def _write_heading(pdf, text, width, height):
    """Write a centered 13pt Helvetica section heading and move to the next line."""
    pdf.set_font("Helvetica", size=13)
    pdf.cell(width, height, txt=text, ln=True, align='C')


def _write_target_intro(pdf, target_type, width, heading_height, line_height):
    """Write the target-variable section heading followed by its introductory sentence."""
    _write_heading(pdf, "Target Variable Summary", width, heading_height)
    pdf.set_font("Helvetica", size=11)
    pdf.multi_cell(width, line_height, txt=f"Target variable is a {target_type} variable. Please find the information about the target variable below:", align='L')


def summarize(dataset: pd.DataFrame,
              dataset_name: str = "Dataset Summary",
              description: str = "Dataset summary generated by summarease.",
              summarize_by: str = "plot",
              target_variable: str = None,
              target_type: str = "categorical",
              output_file: str = "summary.pdf",
              output_dir: str = "./summarease_summary/"
):
    """
    Summarizes the given dataset into a PDF report containing a numeric-columns summary,
    an optional target-variable summary and a data-types table.

    Parameters:
    -----------
    dataset : pd.DataFrame
        The dataframe to be summarized. Must have between 2 and 15 columns (inclusive).

    dataset_name : str, optional, default="Dataset Summary"
        Represents the title of the summary, can be simply the name of the dataset.

    description : str, optional, default="Dataset summary generated by summarease."
        A description of the dataset to provide context in the summary.

    summarize_by : str, optional, default="plot"
        Specifies what visual elements to use when summarizing the dataset (case-insensitive):
        - "table" : Summarize using tables.
        - "plot" : Summarize using plots (saved as PNG files under `<output_dir>/img`).
        - "mix" : Accepted by the validation, but no numeric or target content is generated
          for it here; only the section heading and the data-types table are written.

    target_variable : str, optional, default=None
        The name of the target variable in the dataset. When given, a target-variable
        section is added to the report.

    target_type : str, within {"categorical", "numerical"}
        The type of target variable. Only type-checked when `target_variable` is given.

    output_file : str, optional, default="summary.pdf"
        The name of the output file where the summary will be saved. Must end in ".pdf"
        or have no extension; the name is used exactly as given.

    output_dir : str, optional, default="./summarease_summary/"
        The directory where the output summary file will be saved. Created if missing.

    Returns:
    --------
    None
        The report is written to `output_dir / output_file` and "PDF created!" is printed.

    Raises:
    -------
    AssertionError
        If an argument has the wrong type or value, the dataset has fewer than 2 or more
        than 15 columns, or the PDF file does not exist after writing.

    Notes:
    ------
    - All argument and dataset checks run before any directory is created.
    - The data-types table is always a table, regardless of `summarize_by`.

    Example:
    --------
    >>> import pandas as pd
    >>> from summarease.summarize import summarize
    >>> data = pd.DataFrame({
    ...     "Age": [23, 45, 31, 35, 29],
    ...     "Gender": ["Male", "Female", "Female", "Male", "Male"],
    ...     "Salary": [50000, 60000, 75000, 80000, 65000]
    ... })
    >>> summarize(
    ...     dataset=data,
    ...     dataset_name="Employee Data Summary",
    ...     description="Summary of employee demographic and salary data.",
    ...     summarize_by="plot",
    ...     output_file="employee_summary.pdf"
    ... )
    # This will generate a summary of the `data` dataframe
    # and save the summary as 'employee_summary.pdf' in the default output directory.
    """
    assert isinstance(dataset, pd.DataFrame), f"Argument 'dataset' should be pandas dataframe (pd.DataFrame)! You have {type(dataset)}."
    assert isinstance(dataset_name, str), f"Argument 'dataset_name' should be string (str)! You have {type(dataset_name)}."
    assert isinstance(description, str), f"Argument 'description' should be string (str)! You have {type(description)}."
    assert isinstance(summarize_by, str), f"Argument 'summarize_by' should be a string (str)! You have {type(summarize_by)}."
    if target_variable is not None:
        assert isinstance(target_variable, str), f"Argument 'target_variable' should be a string (str)! You have {type(target_variable)}."
        assert isinstance(target_type, str), f"Argument 'target_type' should be a string (str)! You have {type(target_type)}."
    assert isinstance(output_file, str), f"Argument 'output_file' should be a string (str)! You have {type(output_file)}."
    assert isinstance(output_dir, str), f"Argument 'output_dir' should be a string (str)! You have {type(output_dir)}."

    summarize_by = summarize_by.lower()
    assert summarize_by in {"table", "plot", "mix"}, f"Argument 'summarize_by' should be one of the following options: [table, plot, mix]! You have {summarize_by}."

    output_dir = Path(output_dir)
    output_path = output_dir / output_file

    assert (output_path.suffix == ".pdf") or (output_path.suffix == ""), f"The 'output_file' should either have a .pdf extension or no extension! You have {output_path.suffix}."

    # Validate the dataset before touching the filesystem, so a rejected call
    # does not leave freshly created directories behind.
    dataset_shape = dataset.shape
    assert (dataset_shape[1] >= 2 and dataset_shape[1] <= 15), f"The function currently supports dataframes having less than 15 columns and more than 2 columns! You have {dataset_shape[1]}"

    # If the path doesn't exist, create it
    validate_or_create_path(output_dir)

    # Plots are written to disk first and then embedded into the PDF
    if summarize_by in {"plot", "mix"}:
        plot_output_path = output_dir / "img"
        validate_or_create_path(plot_output_path)

    # Create the PDF and its first page
    pdf = FPDF()
    pdf.add_page()

    page_width = pdf.w
    page_height = pdf.h
    # Horizontal space available for text cells
    content_width = page_width - 2 * pdf.l_margin

    element_padding = 10
    text_line_padding = 10

    # Title
    pdf.set_font("Helvetica", size=15)
    pdf.cell(content_width, element_padding, txt=dataset_name, ln=True, align='C')

    # Description
    pdf.set_font("Helvetica", size=11)
    pdf.multi_cell(content_width, text_line_padding, txt=description, align='L')

    pdf = switch_page_if_needed(pdf)
    _write_heading(pdf, "Numeric Columns Summary", content_width, element_padding)

    if summarize_by == "plot":
        summarized_numeric_output = summarize_numeric(dataset, summarize_by="plot")
        if summarized_numeric_output:
            # Each entry is saved as '<key>.png' and then placed into the PDF
            for key, item in summarized_numeric_output.items():
                plot_file = plot_output_path / f'{key}.png'
                item.save(plot_file)
                pdf = add_image(pdf, image_path=str(plot_file), pdf_height=page_height, pdf_width=page_width, element_padding=10)

        if target_variable is not None:
            pdf = switch_page_if_needed(pdf)
            _write_target_intro(pdf, target_type, content_width, element_padding, text_line_padding)
            summarized_target_output = summarize_target_df(dataset, target_variable, target_type)
            summarized_target_plot = summarize_target_balance_plot(summarized_target_output)
            target_plot_file = plot_output_path / "target_plot.png"
            summarized_target_plot.save(target_plot_file)
            pdf = add_image(pdf, target_plot_file, pdf_height=page_height, pdf_width=page_width, element_padding=0)

    elif summarize_by == "table":
        summarized_numeric_output = summarize_numeric(dataset, summarize_by="table")
        if summarized_numeric_output:
            pdf = add_table(pdf, table=summarized_numeric_output["numeric_describe"], pdf_height=page_height, pdf_width=page_width, element_padding=15)

        if target_variable is not None:
            pdf = switch_page_if_needed(pdf)
            summarized_target_output = summarize_target_df(dataset, target_variable, target_type)
            _write_target_intro(pdf, target_type, content_width, element_padding, text_line_padding)
            pdf = add_table(pdf, table=summarized_target_output, pdf_height=page_height, pdf_width=page_width, element_padding=15)

    # The data-types overview is appended for every `summarize_by` option
    summarized_dtypes_table = summarize_dtypes_table(dataset)
    _write_heading(pdf, "Dataset Data Types Summary", content_width, element_padding)
    pdf = add_table(pdf, table=summarized_dtypes_table, pdf_height=page_height, pdf_width=page_width, element_padding=15)

    pdf.output(output_path)
    assert output_path.exists(), "Something went wrong... The PDF output was not saved."
    print("PDF created!")