Split Target Column
Splits the data into Features (X) and Target (y). Returns X as a DataFrame and y as a Series.
Processing
This brick prepares your dataset for machine learning by separating the column you want to predict (the Target) from the columns used to make that prediction (the Features).
It takes an input table and isolates the specific column you define as the target. The result is split into two parts:
- Features (X): A table containing all columns except the target.
- Target (y): A series (list of values) containing only the target column.
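Conceptually, the split behaves like this minimal pandas sketch (the dataset and column names here are illustrative, not part of the brick):

```python
import pandas as pd

# A small illustrative dataset: two feature columns and a "price" target.
df = pd.DataFrame({
    "area": [50, 80, 120],
    "rooms": [2, 3, 4],
    "price": [150, 240, 360],
})

# Features (X): every column except the target.
X = df.drop(columns=["price"])
# Target (y): only the target column, as a Series.
y = df["price"]

print(list(X.columns))  # → ['area', 'rooms']
print(y.tolist())       # → [150, 240, 360]
```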
Inputs
- data
- The dataset you want to split. This should be a table containing both your feature columns and the target column you intend to predict.
- target column (optional)
- The exact name of the column representing the value you want to predict (e.g., "price", "churn", "species").
Input Types
| Input | Types |
|---|---|
| data | DataFrame, ArrowTable |
| target column | Str |
You can check the list of supported types here: Available Type Hints.
Outputs
- X
- The Features dataset. This is a DataFrame containing every column from the original input except the specified target column.
- y
- The Target data. This is a Series containing only the values from the specified target column.
Output Types
| Output | Types |
|---|---|
| X | DataFrame |
| y | DataSeries |
You can check the list of supported types here: Available Type Hints.
Options
The Split Target Column brick provides the following configurable options:
- Target Column Name
- The name of the column to extract as the target. If you provided a value in the target column input above, this option is ignored.
- Output Format
- Determines the library used for the output data structures. This helps ensure compatibility with the next steps in your workflow.
- pandas: Returns a standard Pandas DataFrame and Series. Best for general compatibility with most Python libraries.
- polars: Returns a Polars DataFrame and Series. Best for high-performance workflows handling large datasets.
- Verbose
- Controls how much information is logged during processing.
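As a hedged illustration, an options mapping combining these settings might look like the following (key names are inferred from the option descriptions above; the exact schema depends on your Coded Flows setup):

```python
options = {
    "target_column": "price",   # used only if the target column input is empty
    "output_format": "polars",  # "pandas" (default) or "polars"
    "verbose": True,            # log progress messages during processing
}
```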
```python
import logging

import duckdb
import pandas as pd
import polars as pl
import pyarrow as pa
from coded_flows.types import Union, DataFrame, ArrowTable, Tuple, DataSeries, Str
from coded_flows.utils import CodedFlowsLogger

logger = CodedFlowsLogger(name="Split Target Column", level=logging.INFO)


def _coalesce(*values):
    """Return the first non-None value, or None if all values are None."""
    return next((v for v in values if v is not None), None)


def _sanitize_identifier(identifier):
    """Sanitize a SQL identifier by escaping embedded double quotes."""
    return identifier.replace('"', '""')


def split_target_column(
    data: Union[DataFrame, ArrowTable], target_column: Str = None, options=None
) -> Tuple[DataFrame, DataSeries]:
    brick_display_name = "Split Target Column"
    options = options or {}
    verbose = options.get("verbose", True)
    target_col_name = _coalesce(target_column, options.get("target_column"))
    output_format = options.get("output_format", "pandas")
    X = None
    y = None

    if not target_col_name:
        verbose and logger.error(f"[{brick_display_name}] No target column specified.")
        raise ValueError("A target column name must be provided.")

    try:
        verbose and logger.info(
            f"[{brick_display_name}] Starting split operation. Target: '{target_col_name}'"
        )

        # Detect the input data format.
        data_type = None
        if isinstance(data, pd.DataFrame):
            data_type = "pandas"
        elif isinstance(data, pl.DataFrame):
            data_type = "polars"
        elif isinstance(data, pa.Table):
            data_type = "arrow"

        if data_type is None:
            raise ValueError(
                "Input data must be a pandas DataFrame, Polars DataFrame, or Arrow Table"
            )

        verbose and logger.info(
            f"[{brick_display_name}] Detected input format: {data_type}."
        )

        conn = duckdb.connect(":memory:")
        try:
            conn.register("input_table", data)

            column_info = conn.execute("DESCRIBE input_table").fetchall()
            all_columns = [col[0] for col in column_info]

            if target_col_name not in all_columns:
                raise ValueError(
                    f"Target column '{target_col_name}' not found in dataset."
                )

            sanitized_target = _sanitize_identifier(target_col_name)
            query_X = f'SELECT * EXCLUDE ("{sanitized_target}") FROM input_table'
            query_y = f'SELECT "{sanitized_target}" FROM input_table'

            verbose and logger.info(f"[{brick_display_name}] Executing split queries.")

            if output_format == "pandas":
                X = conn.execute(query_X).df()
                y = conn.execute(query_y).df().iloc[:, 0]
                verbose and logger.info(
                    f"[{brick_display_name}] Converted result to pandas DataFrame (X) and Series (y)."
                )
            elif output_format == "polars":
                X = conn.execute(query_X).pl()
                y = conn.execute(query_y).pl().to_series()
                verbose and logger.info(
                    f"[{brick_display_name}] Converted result to Polars DataFrame (X) and Series (y)."
                )
            else:
                raise ValueError(
                    f"Unsupported output format: {output_format}. Must be 'pandas' or 'polars'."
                )
        finally:
            # Release the connection on both success and failure paths.
            conn.close()

        verbose and logger.info(
            f"[{brick_display_name}] Split complete. X shape: {X.shape}, y shape: {y.shape}"
        )
    except Exception as e:
        verbose and logger.error(
            f"[{brick_display_name}] Error during split operation: {e}"
        )
        raise

    return (X, y)
```
Brick Info
- version: v0.1.4
- python: 3.11, 3.12, 3.13
- requirements:
  - shap>=0.47.0
  - pandas
  - pyarrow
  - polars[pyarrow]
  - numba>=0.56.0
  - duckdb