Source code for sanafe.data

"""
Convert SANA-FE trace outputs into useful formats.

Each function accepts whatever type is most convenient:
    - a path or Path to a CSV file produced by chip.sim()
    - the dict returned by chip.sim()
    - the raw in-memory value held under the matching key
    - a DataFrame (returned as-is)
"""

from __future__ import annotations

from pathlib import Path
from typing import Any, Iterable, Sequence

import numpy as np
import pandas as pd


def _is_path(source: Any) -> bool:
    """Return True when ``source`` is a ``str`` or a :class:`pathlib.Path`."""
    path_types = (str, Path)
    return isinstance(source, path_types)


def _maybe_unwrap(source: Any, key: str) -> Any:
    """Return ``source[key]`` for a dict holding ``key``, else ``source`` itself.

    A dict that looks like a raw performance trace is never unwrapped, even
    when it contains ``key``; it is handed back unchanged.
    """
    # Anything that is not a dict, or a dict without the key, passes through.
    if not isinstance(source, dict) or key not in source:
        return source
    if _looks_like_perf_dict(source):
        return source
    return source[key]


def _looks_like_perf_dict(source: Any) -> bool:
    """Return True when ``source`` is a dict shaped like a raw performance trace.

    That means it has a ``"total_energy"`` key and a ``"timestep"`` key whose
    value is a list, tuple or :class:`numpy.ndarray`.
    """
    if not isinstance(source, dict):
        return False
    if "total_energy" not in source or "timestep" not in source:
        return False
    sequence_types = (list, tuple, np.ndarray)
    return isinstance(source["timestep"], sequence_types)



[docs]
def spikes_to_raster(
    source: Any,
    groups: Sequence[str] | None = None,
    time_range: tuple[int, int] | None = None,
    n_timesteps: int | None = None,
) -> tuple[np.ndarray, list[str], np.ndarray]:
    """Convert a spike trace into a dense 2D raster matrix.

    Args:
        source: The spike trace data. May be a :class:`pandas.DataFrame`, a
            path to a CSV file, the dict returned by ``chip.sim()``, or the raw
            ``spike_trace`` values i.e., a per-timestep list, where each element
            is a list of spiking neurons for that timestep.
        groups: Optional subset of neuron groups to include. Row order in the
            returned matrix follows the order given here; when omitted, all
            groups are included in sorted order.
        time_range (tuple(int, int), optional): Optional ``(start, stop)``
            half-open interval of timesteps to include. When omitted, spans from
            the minimum to maximum timestep present in the data.
        n_timesteps (int, optional): Optional explicit number of columns in the
            matrix. Useful when the trace ends before the simulation does (e.g.,
            no spikes in the final timesteps) and you want the matrix to span
            the full run. Ignored when ``time_range`` is given.

    Returns:
        tuple: A 3-tuple ``(matrix, neuron_ids, timesteps)`` where

        * ``matrix`` is a 2D boolean :class:`numpy.ndarray` of shape
          ``(n_neurons, n_timesteps)`` with ``True`` where a spike occurred.
        * ``neuron_ids`` is a list of ``"group.offset"`` strings labelling
          each row of the matrix. Only neurons with at least one spike in the
          selected groups and time window get a row.
        * ``timesteps`` is a 1D :class:`numpy.ndarray` giving the timestep
          index for each column.

    Raises:
        ValueError: If no spike trace data can be found in ``source``, or if
            ``groups`` contains unknown group names.
    """
    # spikes_to_dataframe lives in this same module; the name is resolved at
    # call time, so no import is needed.
    df = spikes_to_dataframe(source)

    all_groups = sorted(df["group"].unique())
    if groups is None:
        groups = all_groups
    else:
        unknown = set(groups) - set(all_groups)
        if unknown:
            raise ValueError(
                f"Unknown groups: {unknown}. Available: {all_groups}")
        df = df[df["group"].isin(groups)]

    # Determine time axis
    if time_range is not None:
        t_start, t_stop = time_range
        df = df[(df["timestep"] >= t_start) & (df["timestep"] < t_stop)]
    else:
        t_start = int(df["timestep"].min()) if len(df) else 0
        if n_timesteps is not None:
            t_stop = t_start + n_timesteps
        else:
            t_stop = int(df["timestep"].max()) + 1 if len(df) else t_start + 1

    timesteps = np.arange(t_start, t_stop)

    # Determine row order: stable per-group ordering by neuron_offset
    neuron_ids: list[str] = []
    row_of: dict[str, int] = {}
    for g in groups:
        offsets = sorted(df.loc[df["group"] == g, "neuron_offset"].unique())
        for off in offsets:
            nid = f"{g}.{int(off)}"
            row_of[nid] = len(neuron_ids)
            neuron_ids.append(nid)

    matrix = np.zeros((len(neuron_ids), len(timesteps)), dtype=bool)
    if len(df) and len(neuron_ids) and len(timesteps):
        # Series.map yields NaN for ids that are not in row_of, which makes the
        # result a float array. Mask those entries out explicitly and cast the
        # survivors to integers so they are usable as array indices.
        rows = df["neuron_id"].map(row_of).to_numpy(dtype=float)
        cols = df["timestep"].to_numpy() - t_start
        valid = ~np.isnan(rows) & (cols >= 0) & (cols < len(timesteps))
        matrix[rows[valid].astype(np.intp), cols[valid].astype(np.intp)] = True

    return matrix, neuron_ids, timesteps




[docs]
def spikes_to_dataframe(source: Any) -> pd.DataFrame:
    """Convert a spike trace into a pandas DataFrame.

    Args:
        source (pandas.DataFrame or Path or str or dict or list[list[NeuronAddress]]):
           The spike trace data. May be a :class:`pandas.DataFrame` (returned
           unchanged), a path to a CSV file with ``timestep`` and ``neuron``
           columns, the dict returned by ``chip.sim()``, or the raw
           ``spike_trace`` value. In memory, spike traces are a per-timestep
           list, with each entry a list of :class:`NeuronAddress` of neurons
           that fired that timestep. Each address may expose ``group_name`` and
           ``neuron_offset`` either as attributes or as mapping keys.

    Returns:
        pandas.DataFrame: A DataFrame with columns ``timestep``, ``group``,
        ``neuron_offset``, and ``neuron_id``.

    Raises:
        ValueError: If no spike trace data can be found in ``source``.
    """
    columns = ["timestep", "group", "neuron_offset", "neuron_id"]

    if isinstance(source, pd.DataFrame):
        return source

    if _is_path(source):
        df = pd.read_csv(source, dtype={"neuron": str})
        # Split on the last "." only, so the group part may itself contain dots.
        split = df["neuron"].str.rsplit(".", n=1, expand=True)
        # A trace without any rows can expand to fewer than two columns; pad so
        # the lookups below work and an empty trace yields an empty frame.
        split = split.reindex(columns=[0, 1])
        df["group"] = split[0]
        df["neuron_offset"] = split[1].astype(int)
        df["neuron_id"] = df["neuron"]
        return df[columns]

    spikes = _maybe_unwrap(source, "spike_trace")
    # _maybe_unwrap hands a dict back unchanged when it cannot extract the
    # trace from it, so a dict at this point means there is no trace to read.
    if spikes is None or isinstance(spikes, dict):
        raise ValueError("No spike trace data in source")

    records = []
    for t, fired in enumerate(spikes):
        for s in fired:
            # Compare against None (not truthiness) so falsy-but-valid values
            # such as offset 0 are taken from the attribute.
            group = getattr(s, "group_name", None)
            if group is None:
                group = s["group_name"]
            offset = getattr(s, "neuron_offset", None)
            if offset is None:
                offset = s["neuron_offset"]
            records.append({
                "timestep": t,
                "group": group,
                "neuron_offset": offset,
                "neuron_id": f"{group}.{offset}",
            })
    return pd.DataFrame(records, columns=columns)




[docs]
def potentials_to_dataframe(source: Any,
                            neuron_ids: Sequence[str] | None = None) -> pd.DataFrame:
    """Convert a membrane-potential trace into a pandas DataFrame indexed by timestep.

    Args:
        source (pandas.DataFrame or str or Path or dict or list[list[float]]):
            The potential trace data. May be a :class:`pandas.DataFrame`, a
            path to a CSV file, the dict returned by ``chip.sim()``, or a 2D
            array of ``potential_trace`` values (i.e., a per-timestep list,
            containing per-neuron lists of potentials).
        neuron_ids (List[str]): Optional column labels for the neurons. When
            omitted, columns are named ``Neuron 0``, ``Neuron 1``, and so on.
            Only used when ``source`` is in-memory trace data.

    Returns:
        pandas.DataFrame: A DataFrame whose index is named ``timestep`` and
        whose columns correspond to individual neurons.

    Raises:
        ValueError: If no potential trace data is found, or if ``neuron_ids``
            length does not match the number of trace columns.
    """
    if isinstance(source, pd.DataFrame):
        df = source.copy()
        if "timestep" in df.columns:
            df = df.set_index("timestep")
        return df

    if _is_path(source):
        df = pd.read_csv(source)
        if "timestep" in df.columns:
            df = df.set_index("timestep")
        df.columns = [c.replace("neuron ", "", 1) for c in df.columns]
        return df

    potentials = _maybe_unwrap(source, "potential_trace")
    if potentials is None or len(potentials) == 0:
        raise ValueError("No potential trace data in source")

    arr = np.asarray(potentials, dtype=float)
    n_neurons = arr.shape[1] if arr.ndim == 2 else 0
    if neuron_ids is None:
        neuron_ids = [f"Neuron {i}" for i in range(n_neurons)]
    elif len(neuron_ids) != n_neurons:
        raise ValueError(
            f"neuron_ids has {len(neuron_ids)} entries but trace has {n_neurons} columns")

    df = pd.DataFrame(arr, columns=list(neuron_ids))
    df.index.name = "timestep"
    return df



def neuron_traces_to_dataframe(source: Any,
                              neuron_ids: Sequence[str] | None = None) -> pd.DataFrame:
    """Convert additional neuron traces into a pandas DataFrame indexed by timestep.

    Args:
        source (pandas.DataFrame or str or Path or dict or dict[str, list[list]]):
            The neuron trace data. May be a :class:`pandas.DataFrame`, a
            path to a CSV file, the dict returned by ``chip.sim()``, or a 2D
            array of neuron_traces (i.e., a per-timestep list, containing
            per-neuron lists of values).
        neuron_ids (List[str]): Optional column labels for the neurons. When
            omitted, columns are named ``Neuron 0``, ``Neuron 1``, and so on.
            Only used when ``source`` is in-memory trace data.

    Returns:
        pandas.DataFrame: A DataFrame whose index is named ``timestep`` and
        whose columns correspond to individual neurons.

    Raises:
        ValueError: If no potential trace data is found, or if ``neuron_ids``
            length does not match the number of trace columns.
    """
    if isinstance(source, pd.DataFrame):
        df = source.copy()
        index_cols = [c for c in ("timestep", "neuron_id") if c in df.columns]
        if index_cols:
            df = df.set_index(index_cols)
        return df

    if _is_path(source):
        # Drop pandas' phantom column from the trailing-comma rows.
        raw = pd.read_csv(source)


        if "timestep" not in raw.columns:
            raise ValueError(
                f"Neuron trace CSV {source!r} is missing a 'timestep' column")

        long_rows: list[dict] = []
        for col in raw.columns:
            if col == "timestep" or col.startswith("Unnamed"):
                continue
            # Parse "neuron <group>.<offset>/<variable>"
            stripped = col[len("neuron "):] if col.startswith("neuron ") else col
            if "/" not in stripped:
                raise ValueError(
                    f"Unrecognised neuron trace column {col!r}; "
                    "expected 'neuron <group>.<offset>/<variable>'")
            neuron_id, var = stripped.rsplit("/", 1)
            for t, v in zip(raw["timestep"], raw[col]):
                long_rows.append(
                    {"timestep": t, "neuron_id": neuron_id, "var": var, "value": v})

        df = pd.DataFrame(long_rows)
        # Pivot to (timestep, neuron_id) index, one column per variable.
        return df.pivot_table(
            index=["timestep", "neuron_id"], columns="var", values="value")

    traces = _maybe_unwrap(source, "neuron_trace")
    if traces is None or (isinstance(traces, dict) and not traces):
        raise ValueError("No neuron trace data in source")
    if not isinstance(traces, dict):
        raise ValueError(
            f"Expected neuron_trace to be a dict[str, list[list]], got {type(traces).__name__}")

    # Stack each variable into a (T, N) array and check shapes agree.
    arrays: dict[str, np.ndarray] = {}
    n_timesteps: int | None = None
    n_neurons: int | None = None
    for var, values in traces.items():
        arr = np.asarray(values, dtype=float)
        if arr.ndim != 2:
            raise ValueError(
                f"neuron_trace[{var!r}] is not 2D (got shape {arr.shape})")
        if n_timesteps is None:
            n_timesteps, n_neurons = arr.shape
        elif arr.shape != (n_timesteps, n_neurons):
            raise ValueError(
                f"neuron_trace[{var!r}] has shape {arr.shape}, "
                f"expected ({n_timesteps}, {n_neurons})")
        arrays[var] = arr

    if n_neurons == 0:
        raise ValueError("Neuron traces contain no neurons")

    if neuron_ids is None:
        neuron_ids = [f"Neuron {i}" for i in range(n_neurons)]
    elif len(neuron_ids) != n_neurons:
        raise ValueError(
            f"neuron_ids has {len(neuron_ids)} entries but trace has {n_neurons} columns")

    index = pd.MultiIndex.from_product(
        [range(n_timesteps), list(neuron_ids)],
        names=["timestep", "neuron_id"],
    )
    return pd.DataFrame(
        {var: arr.reshape(-1) for var, arr in arrays.items()},
        index=index,
    )



[docs]
def performance_to_dataframe(source: Any) -> pd.DataFrame:
    """Convert a performance trace into a DataFrame.

    Args:
        source (pandas.DataFrame or Path or str or dict):
            The performance trace data. May be a :class:`pandas.DataFrame`
            (returned unchanged), a path to a CSV file, the dict from
            ``chip.sim()``, or a dict containing performance data
            (dict[str, list], i.e. lists of per-timestep values for each
            metric).

    Returns:
        pandas.DataFrame: A DataFrame of per-timestep performance metrics.

    Raises:
        ValueError: If no performance trace data can be found in ``source``.
    """
    if isinstance(source, pd.DataFrame):
        return source
    if _is_path(source):
        return pd.read_csv(source)

    if _looks_like_perf_dict(source):
        # Already the raw per-metric mapping; use it directly.
        perf_data = source
    else:
        perf_data = _maybe_unwrap(source, "perf_trace")
        is_empty_dict = isinstance(perf_data, dict) and len(perf_data) == 0
        if perf_data is None or is_empty_dict:
            raise ValueError("No performance trace data in source")

    return pd.DataFrame(perf_data)




[docs]
def messages_to_dataframe(source: Any) -> pd.DataFrame:
    """Convert a message trace into a flat pandas DataFrame, one row per message.

    For in-memory trace data, convenience identifier columns are derived when
    their components are present and the column does not already exist:

    * ``src_neuron`` from ``src_neuron_group_id`` and ``src_neuron_offset``
    * ``src_hw`` from ``src_tile_id`` and ``src_core_offset``
    * ``dest_hw`` from ``dest_tile_id`` and ``dest_core_offset``

    DataFrame input is returned unchanged and CSV input is returned as read;
    no columns are derived for either.

    Args:
        source (pandas.DataFrame or Path or str or dict or list[list[dict]]):
            The message trace data. May be a :class:`pandas.DataFrame`, a path
            to a CSV file, the dict returned by ``chip.sim()``, or the raw
            ``message_trace`` values.

    Returns:
        pandas.DataFrame: A DataFrame with one row per message.

    Raises:
        ValueError: If no message trace data can be found in ``source``.
    """
    if isinstance(source, pd.DataFrame):
        return source

    if _is_path(source):
        return pd.read_csv(source)

    messages = _maybe_unwrap(source, "message_trace")
    # _maybe_unwrap hands a dict back unchanged when it cannot extract the
    # trace from it, so a dict at this point means there is no trace to read.
    if messages is None or isinstance(messages, dict):
        raise ValueError("No message trace data in source")

    # (derived column, first component, second component)
    derived = (
        ("src_neuron", "src_neuron_group_id", "src_neuron_offset"),
        ("src_hw", "src_tile_id", "src_core_offset"),
        ("dest_hw", "dest_tile_id", "dest_core_offset"),
    )

    rows: list[dict] = []
    for ts_msgs in messages:
        for m in ts_msgs:
            row = dict(m)
            for name, first, second in derived:
                # Never overwrite an identifier the message already carries.
                if name not in row and first in row and second in row:
                    row[name] = f"{row[first]}.{row[second]}"
            rows.append(row)
    return pd.DataFrame(rows)