Correlation Matrix
Compute correlation between features using various methods (Pearson, Spearman, Kendall, Phi K, Cramers V, or Auto). Supports pandas, polars DataFrames, and Arrow tables.
Correlation Matrix
Processing
Computes the correlation matrix between features present in the input dataset using a selected statistical method. It supports standard methods like Pearson, Spearman, and Kendall, as well as association measures like Phi K and Cramers V for categorical and mixed data types.
Inputs
- data
- The input dataset (DataFrame or Arrow Table) containing the features for which correlation must be computed.
Inputs Types
| Input | Types |
|---|---|
| data | DataFrame, ArrowTable |
You can check the list of supported types here: Available Type Hints.
Outputs
- matrix
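- The DataFrame holding the computed correlation coefficients between all features: one row per feature, with the feature names in a leading label column (named `index` by default) followed by one coefficient column per feature.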
Outputs Types
| Output | Types |
|---|---|
| matrix | DataFrame |
You can check the list of supported types here: Available Type Hints.
Options
The Correlation Matrix brick contains some changeable options:
- Correlation Method
- Selects the statistical method used to compute correlation. Choices include `pearson`, `spearman`, `kendall` (typically for continuous data), `phi_k` (for mixed data types), or `cramers` (for association between categorical variables). (Default: pearson)
- Verbose Output
- If enabled, detailed logging messages regarding the conversion and computation process are displayed. (Default: True)
import logging
import pandas as pd
import polars as pl
import pyarrow as pa
from coded_flows.types import DataFrame, ArrowTable, Union
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def _cramers_v_matrix(data):
    """Return the symmetric Cramér's V association matrix for all column pairs.

    Each pair of columns is cross-tabulated and the chi-squared statistic of
    that contingency table is normalised into Cramér's V. The diagonal is
    fixed at 1.0.

    Args:
        data: pandas DataFrame whose columns are compared pairwise.

    Returns:
        A pandas DataFrame indexed and labelled by the column names of *data*.

    Raises:
        ImportError: If scipy or numpy cannot be imported.
    """
    # Imported here rather than at module level; the caller translates a
    # missing package into a RuntimeError with an explanatory message.
    from scipy.stats import chi2_contingency
    import numpy as np

    columns = data.columns.tolist()
    size = len(columns)
    # Start from the identity: every feature gets 1.0 against itself.
    values = np.eye(size)
    for i in range(size):
        # Only the upper triangle is computed; the result is mirrored below.
        for j in range(i + 1, size):
            contingency = pd.crosstab(data[columns[i]], data[columns[j]])
            chi2, _, _, _ = chi2_contingency(contingency)
            n_obs = contingency.sum().sum()
            min_dim = min(contingency.shape) - 1
            # A table with a single row or column carries no association.
            cramers_v = np.sqrt(chi2 / (n_obs * min_dim)) if min_dim > 0 else 0
            values[i, j] = cramers_v
            values[j, i] = cramers_v
    return pd.DataFrame(values, index=columns, columns=columns)


def compute_correlation_matrix(
    data: Union[DataFrame, ArrowTable], options=None
) -> DataFrame:
    """Compute a correlation / association matrix between the columns of *data*.

    Args:
        data: A pandas DataFrame, polars DataFrame or pyarrow Table. Non-pandas
            inputs are converted with their ``to_pandas()`` method first.
        options: Optional dict with the keys
            ``"correlation_type"`` - one of ``"pearson"``, ``"spearman"``,
            ``"kendall"``, ``"phi_k"`` or ``"cramers"`` (default ``"pearson"``);
            ``"verbose"`` - when true (the default), progress and error
            messages are written to the module logger.

    Returns:
        A pandas DataFrame with one row per feature. The feature labels are
        moved out of the index into a leading column by ``reset_index`` and are
        followed by one coefficient column per feature.

    Raises:
        ValueError: If the converted DataFrame is empty or the correlation
            type is not recognised.
        RuntimeError: If the input cannot be converted to pandas (including an
            unsupported input type), a required optional library is missing,
            the computation fails, or the resulting matrix is empty.
    """
    brick_display_name = "Correlation Matrix"
    options = options or {}
    verbose = options.get("verbose", True)
    # Default to Pearson: there is no "auto" branch below, so the former
    # default of "auto" made every call without options fail.
    correlation_type = options.get("correlation_type", "pearson")
    if verbose:
        logger.info(
            f"[{brick_display_name}] Starting correlation computation with method: '{correlation_type}'"
        )

    # --- Normalise the input to a pandas DataFrame -------------------------
    try:
        if isinstance(data, pd.DataFrame):
            if verbose:
                logger.info(
                    f"[{brick_display_name}] Input is already pandas DataFrame"
                )
        elif hasattr(data, "to_pandas") and isinstance(data, pl.DataFrame):
            if verbose:
                logger.info(
                    f"[{brick_display_name}] Converting polars DataFrame to pandas"
                )
            data = data.to_pandas()
        elif hasattr(data, "to_pandas") and isinstance(data, pa.Table):
            if verbose:
                logger.info(
                    f"[{brick_display_name}] Converting Arrow table to pandas"
                )
            data = data.to_pandas()
        else:
            raise ValueError(f"Unsupported data type: {type(data).__name__}")
    except Exception as e:
        # Every conversion problem, unsupported types included, is reported
        # to callers as RuntimeError.
        error_msg = f"Failed to convert input data to pandas DataFrame: {e}"
        if verbose:
            logger.error(f"[{brick_display_name}] {error_msg}")
        raise RuntimeError(error_msg) from e

    if data.empty:
        error_msg = "Input DataFrame is empty"
        if verbose:
            logger.error(f"[{brick_display_name}] {error_msg}")
        raise ValueError(error_msg)
    if verbose:
        logger.info(
            f"[{brick_display_name}] Processing DataFrame with {data.shape[0]:,} rows × {data.shape[1]:,} columns"
        )

    # Methods handled directly by DataFrame.corr, mapped to their log label.
    pandas_methods = {
        "pearson": "Pearson correlation coefficient",
        "spearman": "Spearman rank correlation coefficient",
        "kendall": "Kendall rank correlation coefficient",
    }

    # --- Compute the matrix ------------------------------------------------
    matrix = None
    try:
        if correlation_type in pandas_methods:
            if verbose:
                logger.info(
                    f"[{brick_display_name}] Computing {pandas_methods[correlation_type]}"
                )
            matrix = data.corr(method=correlation_type, numeric_only=True)
        elif correlation_type == "phi_k":
            if verbose:
                logger.info(
                    f"[{brick_display_name}] Computing Phi K correlation for mixed-type variables"
                )
            try:
                # Importing phik is what makes DataFrame.phik_matrix available.
                import phik  # noqa: F401

                matrix = data.phik_matrix()
            except ImportError as ie:
                error_msg = "phik library is required for phi_k correlation. Install with: pip install phik"
                if verbose:
                    logger.error(f"[{brick_display_name}] {error_msg}")
                raise RuntimeError(error_msg) from ie
        elif correlation_type == "cramers":
            if verbose:
                logger.info(
                    f"[{brick_display_name}] Computing Cramers V association coefficient"
                )
            try:
                matrix = _cramers_v_matrix(data)
            except ImportError as ie:
                error_msg = "scipy library is required for cramers correlation"
                if verbose:
                    logger.error(f"[{brick_display_name}] {error_msg}")
                raise RuntimeError(error_msg) from ie
        else:
            error_msg = f"Invalid correlation type: '{correlation_type}'"
            if verbose:
                logger.error(f"[{brick_display_name}] {error_msg}")
            raise ValueError(error_msg)
    except (ValueError, RuntimeError):
        # Already meaningful to the caller; propagate unchanged.
        raise
    except Exception as e:
        error_msg = f"Failed to compute {correlation_type} correlation: {e}"
        if verbose:
            logger.error(f"[{brick_display_name}] {error_msg}")
        raise RuntimeError(error_msg) from e

    # e.g. a pandas-native method on a frame with no numeric columns.
    if matrix is None or matrix.empty:
        error_msg = "Correlation computation returned empty result"
        if verbose:
            logger.error(f"[{brick_display_name}] {error_msg}")
        raise RuntimeError(error_msg)
    if verbose:
        logger.info(
            f"[{brick_display_name}] Successfully computed correlation matrix: {matrix.shape[0]:,} × {matrix.shape[1]:,}"
        )
    # Move the feature labels from the index into a regular leading column.
    matrix.reset_index(inplace=True)
    return matrix
Brick Info
- polars[pyarrow]
- pandas
- phik
- scipy
- pyarrow