Data Profiling Report
Generate a comprehensive HTML data profiling report from a DataFrame.
Processing
This function generates a comprehensive HTML data profiling report for the input data using the ydata-profiling library. It also supports an optional time series mode and returns a summary DataFrame listing all generated alerts.
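For reference, here is a minimal sketch of the underlying ydata-profiling calls that this brick wraps, assuming a small pandas DataFrame named `df` (the sample data is hypothetical):

```python
import pandas as pd
from ydata_profiling import ProfileReport

# Hypothetical sample data used only for illustration.
df = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["x", "y", None, "y"]})

# Build the profile and render it as a standalone HTML page.
profile = ProfileReport(df, title="Data Profiling Report", minimal=False, explorative=True)
profile.to_file("data-profiling-report.html")

# The alerts surfaced by the brick come from the report's description object.
alert_texts = [str(alert) for alert in profile.get_description().alerts]
print(alert_texts)
```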
Inputs
- data
- The input tabular data (DataFrame, Arrow Table, or Polars DataFrame) to be analyzed; examples of each accepted form are sketched after the types table below.
- folder path
- The directory path where the generated HTML report file will be saved.
Inputs Types
| Input | Types |
|---|---|
| data | DataFrame, ArrowTable |
| folder path | DirectoryPath, Str |
You can check the list of supported types here: Available Type Hints.
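A quick sketch of the accepted input forms, using hypothetical column names; the brick converts Arrow tables and Polars frames to pandas internally before profiling:

```python
import pandas as pd
import polars as pl
import pyarrow as pa

# Hypothetical records used only for illustration.
records = {"id": [1, 2, 3], "value": [10.5, None, 7.2]}

pandas_df = pd.DataFrame(records)   # DataFrame input
polars_df = pl.DataFrame(records)   # Polars DataFrame input
arrow_table = pa.table(records)     # ArrowTable input

# "folder path" can be a plain string or a directory path.
output_folder = "reports"  # hypothetical folder name
```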
Outputs
- alerts
- A DataFrame listing the alerts identified during the data profiling process (e.g., warnings about missing values or high correlation); a small inspection sketch follows the types table below.
The alerts output contains the following metadata structure:
- Alerts: A string description of an alert identified by the profiler (e.g., highly correlated columns, high number of missing values).
Outputs Types
| Output | Types |
|---|---|
| alerts | DataFrame |
You can check the list of supported types here: Available Type Hints.
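A small sketch of how the alerts output can be inspected, assuming the brick has already been run via `generate_data_profile` (variable names are hypothetical):

```python
# Each row of the returned DataFrame holds the string form of one
# ydata-profiling alert (missing values, high correlation, skewness, ...).
alerts = generate_data_profile(pandas_df, "reports")
for alert_text in alerts["Alerts"]:
    print(alert_text)
```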
Options
The Data Profiling Report brick exposes the following configurable options (an example configuration is sketched after the list):
- Report Title
- Sets the title used in the HTML report and the basis for the generated filename.
- Minimal Mode
- If enabled, runs the profiling in minimal configuration, reducing memory usage and time at the cost of some detailed analysis.
- Time Series Mode
- If enabled, activates time series analysis features, requiring a date column to be specified for sorting.
- TS Date Column
- Specifies the column name used for sorting the data when Time Series Mode is active.
- Verbose
- If enabled, provides detailed logging output during the execution of the brick.
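A sketch of an options mapping using the option keys read by the brick code below; the column name `order_date` is a hypothetical example:

```python
options = {
    "report_title": "Sales Data Profile",  # Report Title: HTML title and basis of the filename
    "minimal": False,                      # Minimal Mode: lighter, faster profiling when True
    "is_time_series": True,                # Time Series Mode: enable time series analysis
    "sortby_col": "order_date",            # TS Date Column: column used to sort the data
    "verbose": True,                       # Verbose: detailed logging during execution
}
```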
import logging
import os
import pandas as pd
import polars as pl
import pyarrow as pa
from slugify import slugify
from ydata_profiling import ProfileReport
from coded_flows.types import DataFrame, Union, ArrowTable, DirectoryPath, Str
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def generate_data_profile(
    data: Union[DataFrame, ArrowTable],
    folder_path: Union[DirectoryPath, Str],
    options=None,
) -> DataFrame:
    brick_display_name = "Data Profiling Report"

    # Read brick options, falling back to sensible defaults.
    options = options or {}
    verbose = options.get("verbose", True)
    report_title = options.get("report_title", "Data Profiling Report")
    minimal_mode = options.get("minimal", False)
    is_time_series = options.get("is_time_series", False)
    sortby_col = options.get("sortby_col", "")

    alerts = pd.DataFrame({"Alerts": []})

    # Ensure the output directory exists.
    folder_path = DirectoryPath(folder_path)
    folder_path.mkdir(parents=True, exist_ok=True)

    if not report_title:
        report_title = "Data Profiling Report"

    try:
        verbose and logger.info(
            f"[{brick_display_name}] Starting data profiling process."
        )

        # Normalize the input to a pandas DataFrame.
        if isinstance(data, pl.DataFrame):
            data = data.to_pandas()
        elif isinstance(data, pa.Table):
            data = data.to_pandas()
        elif not isinstance(data, pd.DataFrame):
            error_msg = (
                f"Input data is not a pandas DataFrame. Received type: {type(data)}"
            )
            verbose and logger.error(f"[{brick_display_name}] {error_msg}")
            raise TypeError(error_msg)

        if data.empty:
            error_msg = "Input DataFrame is empty."
            verbose and logger.error(f"[{brick_display_name}] {error_msg}")
            raise ValueError(error_msg)

        verbose and logger.info(
            f"[{brick_display_name}] Validated DataFrame: {data.shape[0]} rows, {data.shape[1]} columns."
        )

        # Validate the time series sort column, converting it to datetime if needed.
        if is_time_series and sortby_col:
            if sortby_col not in data.columns:
                verbose and logger.warning(
                    f"[{brick_display_name}] Column '{sortby_col}' not found. Skipping Time Series sort."
                )
            else:
                col_type = data[sortby_col].dtype
                is_date_type = pd.api.types.is_datetime64_any_dtype(col_type)
                if not is_date_type:
                    verbose and logger.info(
                        f"[{brick_display_name}] Attempting to convert '{sortby_col}' to datetime."
                    )
                    try:
                        data[sortby_col] = pd.to_datetime(data[sortby_col])
                        is_date_type = True
                    except Exception as e:
                        verbose and logger.warning(
                            f"[{brick_display_name}] Could not convert '{sortby_col}' to datetime: {e}. Skipping TS sort."
                        )
                if is_date_type:
                    if data[sortby_col].isna().all():
                        verbose and logger.warning(
                            f"[{brick_display_name}] Column '{sortby_col}' is entirely empty. Skipping TS sort."
                        )
                    else:
                        verbose and logger.info(
                            f"[{brick_display_name}] Sorting data by time column: '{sortby_col}'."
                        )
        elif is_time_series and (not sortby_col):
            verbose and logger.warning(
                f"[{brick_display_name}] Time Series mode active but no column specified. Skipping sort."
            )

        verbose and logger.info(
            f"[{brick_display_name}] Initializing ProfileReport (Minimal: {minimal_mode}, TS: {is_time_series})."
        )

        # Build the profiling report with all correlation measures enabled.
        profile = ProfileReport(
            data,
            title=report_title,
            minimal=minimal_mode,
            tsmode=is_time_series,
            explorative=True,
            sortby=sortby_col if is_time_series else None,
            progress_bar=False,
            correlations={
                "auto": {"calculate": True},
                "pearson": {"calculate": True},
                "spearman": {"calculate": True},
                "kendall": {"calculate": True},
                "phi_k": {"calculate": True},
                "cramers": {"calculate": True},
            },
        )

        # Write the HTML report using a slugified version of the title as the filename.
        report_file_path = folder_path / f"{slugify(report_title)}.html"
        verbose and logger.info(
            f"[{brick_display_name}] Saving HTML report to '{folder_path}'."
        )
        profile.to_file(report_file_path)

        # Collect the profiler alerts into the summary DataFrame.
        verbose and logger.info(f"[{brick_display_name}] Extracting analysis metadata.")
        description = profile.get_description()
        alerts = pd.DataFrame({"Alerts": [f"{alert}" for alert in description.alerts]})

        verbose and logger.info(
            f"[{brick_display_name}] Profiling completed successfully."
        )
    except Exception as e:
        error_msg = f"Data profiling failed: {str(e)}"
        verbose and logger.error(f"[{brick_display_name}] {error_msg}")
        raise RuntimeError(error_msg) from e

    return alerts
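A hedged usage sketch of the brick function: profile a small synthetic DataFrame with time series mode enabled and write the HTML report into a local folder. The data, column name, and folder name are hypothetical; per the code above, the output directory is created if it does not exist.

```python
if __name__ == "__main__":
    # Hypothetical time-indexed data.
    df = pd.DataFrame(
        {
            "order_date": pd.date_range("2024-01-01", periods=90, freq="D"),
            "amount": [float(i % 7) for i in range(90)],
        }
    )
    alerts = generate_data_profile(
        df,
        "reports",  # hypothetical output folder
        options={
            "report_title": "Orders Profile",
            "is_time_series": True,
            "sortby_col": "order_date",
        },
    )
    print(alerts)
```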
Brick Info
version
v0.1.0
python
3.10, 3.11, 3.12, 3.13
requirements
- polars[pyarrow]
- ydata-profiling
- pandas
- python-slugify
- pyarrow