Correlation Matrix

Compute the correlation between features using one of several methods (Pearson, Spearman, Kendall, Phi K, Cramér's V, or Auto). Supports pandas DataFrames, polars DataFrames, and Arrow tables.

Correlation Matrix

Processing

Computes the correlation matrix between features present in the input dataset using a selected statistical method. It supports standard methods like Pearson, Spearman, and Kendall, as well as association measures like Phi K and Cramers V for categorical and mixed data types.

Inputs

data
The input dataset (DataFrame or Arrow Table) containing the features for which correlation must be computed.

Input Types

Input Types
data DataFrame, ArrowTable

You can check the list of supported types here: Available Type Hints.

Outputs

matrix
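A DataFrame holding the computed correlation coefficients between the features. The feature names are returned in a leading `index` column, followed by one column per feature, so each row reads as "feature, then its coefficient against every feature".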

Output Types

Output Types
matrix DataFrame

You can check the list of supported types here: Available Type Hints.

Options

The Correlation Matrix brick contains some changeable options:

Correlation Method
Selects the statistical method used to compute correlation. Choices are pearson, spearman, and kendall (typically for continuous data; computed over numeric columns only, non-numeric columns are ignored), phi_k (for mixed data types), and cramers (Cramér's V, for association between categorical variables). (Default: pearson)
Verbose Output
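If enabled, detailed logging messages regarding the conversion and computation process are displayed. If disabled, all log messages from the brick, including error messages, are suppressed; errors are still raised. (Default: True)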
import logging
import pandas as pd
import polars as pl
import pyarrow as pa
from coded_flows.types import DataFrame, ArrowTable, Union

# Configure the root logger at import time so INFO-level messages from this
# brick are emitted. Per the logging docs, basicConfig() does nothing when the
# root logger already has handlers configured.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by compute_correlation_matrix for all its messages.
logger = logging.getLogger(__name__)


def _as_pandas_frame(data, log_info):
    """Return ``data`` as a pandas DataFrame, converting polars / Arrow input.

    Args:
        data: A pandas DataFrame, polars DataFrame or pyarrow Table.
        log_info: Callable taking one message string; used to report which
            conversion path was taken.

    Raises:
        ValueError: If ``data`` is none of the supported types.
    """
    # pandas is checked first: it is the only input that needs no conversion.
    if isinstance(data, pd.DataFrame):
        log_info("Input is already pandas DataFrame")
        return data
    if isinstance(data, pl.DataFrame):
        log_info("Converting polars DataFrame to pandas")
        return data.to_pandas()
    if isinstance(data, pa.Table):
        log_info("Converting Arrow table to pandas")
        return data.to_pandas()
    raise ValueError(f"Unsupported data type: {type(data).__name__}")


def _cramers_v_matrix(data):
    """Return the symmetric Cramér's V matrix for every pair of columns.

    The result is a pandas DataFrame indexed and labelled by the column names
    of ``data``, with 1.0 on the diagonal.
    """
    from itertools import combinations

    import numpy as np
    from scipy.stats import chi2_contingency

    columns = data.columns.tolist()
    # Identity start: diagonal is 1.0, untouched pairs stay at 0.0.
    values = np.eye(len(columns))
    for (i, col1), (j, col2) in combinations(enumerate(columns), 2):
        contingency = pd.crosstab(data[col1], data[col2])
        min_dim = min(contingency.shape) - 1
        if min_dim <= 0:
            # One side has a single level (or the pair shares no non-null
            # rows): V is undefined, report 0 instead of dividing by zero or
            # letting chi2_contingency reject an empty table.
            continue
        # NOTE: chi2_contingency applies Yates' continuity correction by
        # default on 2x2 tables; kept as-is to preserve existing results.
        chi2 = chi2_contingency(contingency)[0]
        n_obs = contingency.to_numpy().sum()
        values[i, j] = values[j, i] = np.sqrt(chi2 / (n_obs * min_dim))
    return pd.DataFrame(values, index=columns, columns=columns)


def compute_correlation_matrix(
    data: Union[DataFrame, ArrowTable], options=None
) -> DataFrame:
    """Compute a feature-by-feature correlation / association matrix.

    Args:
        data: Input dataset as a pandas DataFrame, polars DataFrame or
            pyarrow Table. Non-pandas inputs are converted to pandas first.
        options: Optional dict with the keys:
            ``correlation_type``: one of ``"pearson"``, ``"spearman"``,
            ``"kendall"`` (numeric columns only), ``"phi_k"`` or
            ``"cramers"``. Defaults to ``"pearson"``.
            ``verbose``: when truthy (default), progress and error messages
            are logged; when falsy nothing is logged.

    Returns:
        A pandas DataFrame whose first column, ``index``, holds the feature
        names and whose remaining columns hold the coefficients per feature.

    Raises:
        ValueError: If the input is empty or ``correlation_type`` is not one
            of the supported values.
        RuntimeError: If the input cannot be converted to pandas (including
            unsupported input types), an optional dependency (phik, scipy) is
            missing, the computation fails, or the result is empty.
    """
    brick_display_name = "Correlation Matrix"
    options = options or {}
    verbose = options.get("verbose", True)
    # The previous default ("auto") matched no branch below, so calling the
    # brick without an explicit method always failed.
    correlation_type = options.get("correlation_type", "pearson")

    def log_info(message):
        if verbose:
            logger.info(f"[{brick_display_name}] {message}")

    def log_error(message):
        if verbose:
            logger.error(f"[{brick_display_name}] {message}")

    log_info(f"Starting correlation computation with method: '{correlation_type}'")

    try:
        data = _as_pandas_frame(data, log_info)
    except Exception as e:
        # Every conversion failure, unsupported types included, surfaces as
        # RuntimeError so existing callers keep catching the same type.
        error_msg = f"Failed to convert input data to pandas DataFrame: {e}"
        log_error(error_msg)
        raise RuntimeError(error_msg) from e

    if data.empty:
        error_msg = "Input DataFrame is empty"
        log_error(error_msg)
        raise ValueError(error_msg)

    log_info(
        f"Processing DataFrame with {data.shape[0]:,} rows × {data.shape[1]:,} columns"
    )

    # Methods delegated to DataFrame.corr, with their log labels.
    pandas_method_labels = {
        "pearson": "Pearson correlation coefficient",
        "spearman": "Spearman rank correlation coefficient",
        "kendall": "Kendall rank correlation coefficient",
    }
    matrix = None
    try:
        if correlation_type in ("pearson", "spearman", "kendall"):
            log_info(f"Computing {pandas_method_labels[correlation_type]}")
            matrix = data.corr(method=correlation_type, numeric_only=True)
        elif correlation_type == "phi_k":
            log_info("Computing Phi K correlation for mixed-type variables")
            try:
                # Importing phik registers the DataFrame.phik_matrix accessor.
                import phik  # noqa: F401

                matrix = data.phik_matrix()
            except ImportError as ie:
                error_msg = "phik library is required for phi_k correlation. Install with: pip install phik"
                log_error(error_msg)
                raise RuntimeError(error_msg) from ie
        elif correlation_type == "cramers":
            log_info("Computing Cramers V association coefficient")
            try:
                matrix = _cramers_v_matrix(data)
            except ImportError as ie:
                error_msg = "scipy library is required for cramers correlation"
                log_error(error_msg)
                raise RuntimeError(error_msg) from ie
        else:
            error_msg = f"Invalid correlation type: '{correlation_type}'"
            log_error(error_msg)
            raise ValueError(error_msg)
    except (ValueError, RuntimeError):
        # Already meaningful to the caller; do not re-wrap.
        raise
    except Exception as e:
        error_msg = f"Failed to compute {correlation_type} correlation: {e}"
        log_error(error_msg)
        raise RuntimeError(error_msg) from e

    if matrix is None or matrix.empty:
        error_msg = "Correlation computation returned empty result"
        log_error(error_msg)
        raise RuntimeError(error_msg)

    log_info(
        f"Successfully computed correlation matrix: {matrix.shape[0]:,} × {matrix.shape[1]:,}"
    )
    # Move the feature names from the index into a regular "index" column.
    matrix.reset_index(inplace=True)
    return matrix

Brick Info

version v0.1.0
python 3.10, 3.11, 3.12, 3.13
requirements
  • polars[pyarrow]
  • pandas
  • phik
  • scipy
  • pyarrow