# Source code for pyprocessta.eda.statistics

# -*- coding: utf-8 -*-
from collections import defaultdict
from typing import List

import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import adfuller, grangercausalitytests, kpss

# Public API of this module: the names exported by a star-import.
# NOTE: "computer_granger_causality_matrix" is spelled exactly like the
# function defined further down in this file; the two must stay in sync.
__all__ = [
    "check_stationarity",
    "check_granger_causality",
    "computer_granger_causality_matrix",
]


def check_stationarity(
    series: pd.Series, threshold: float = 0.05, regression: str = "c"
) -> dict:
    """Run the Augmented Dickey-Fuller (ADF) and Kwiatkowski-Phillips-Schmidt-Shin
    (KPSS) tests for stationarity.

    The two tests have opposite null hypotheses (ADF: the series is
    non-stationary; KPSS: the series is stationary). The series is only
    reported as stationary overall if both tests agree, i.e. the ADF null
    is rejected and the KPSS null is not rejected.

    Args:
        series (pd.Series): Time series data
        threshold (float, optional): p-value threshold for the statistical tests.
            Defaults to 0.05.
        regression (str, optional): If regression="c" then the tests check for
            stationarity around a constant. For "ct" the tests check for
            stationarity around a trend. Defaults to "c".

    Returns:
        dict: Results dictionary with the keys "adf" and "kpss" (each a dict
            with "statistic", "p_value" and a bool "stationary") and the
            key "stationary" holding the combined bool conclusion.

    Raises:
        ValueError: If regression is neither "c" nor "ct".
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if regression not in ("c", "ct"):
        raise ValueError(
            'regression must be "c" or "ct", got {!r}'.format(regression)
        )

    adf_results = adfuller(series, regression=regression)
    kpss_results = kpss(series, regression=regression, nlags="auto")

    adf_statistic, adf_p_value = adf_results[0], adf_results[1]
    kpss_statistic, kpss_p_value = kpss_results[0], kpss_results[1]

    # ADF: a small p-value rejects non-stationarity.
    # KPSS: a large p-value fails to reject stationarity.
    # bool() turns possible NumPy booleans into the plain bool the docs promise.
    adf_stationary = bool(adf_p_value < threshold)
    kpss_stationary = bool(kpss_p_value > threshold)

    return {
        "adf": {
            "statistic": adf_statistic,
            "p_value": adf_p_value,
            "stationary": adf_stationary,
        },
        "kpss": {
            "statistic": kpss_statistic,
            "p_value": kpss_p_value,
            "stationary": kpss_stationary,
        },
        "stationary": kpss_stationary and adf_stationary,
    }


def check_granger_causality(
    x: pd.Series, y: pd.Series, max_lag: int = 20, add_constant: bool = True
) -> dict:
    """Check if series x is Granger causal for series y.

    We reject the null hypothesis that x does *not* Granger cause y
    if the p-values are below a desired size of the test.

    Args:
        x (pd.Series): Time series (the candidate cause).
        y (pd.Series): Time series (the candidate effect).
        max_lag (int, optional): Maximum lag to use for the causality checks.
            Defaults to 20.
        add_constant (bool, optional): Forwarded to statsmodels as ``addconst``,
            i.e. whether a constant is included in the test regressions.
            Defaults to True.

    Returns:
        dict: Results dictionary with the keys
            "detail" (the unmodified statsmodels output, keyed by lag),
            "min_p_value" (smallest "ssr_chi2test" p-value over all lags) and
            "lag_w_min_p_value" (the lag at which that p-value occurs).
    """
    # statsmodels tests whether the SECOND column Granger causes the FIRST
    # column, so the effect (y) goes first and the candidate cause (x) second.
    data = np.hstack([y.values.reshape(-1, 1), x.values.reshape(-1, 1)])
    test_result = grangercausalitytests(
        data,
        maxlag=max_lag,
        addconst=add_constant,
        verbose=False,
    )

    p_value_by_lag = {
        lag: outcome[0]["ssr_chi2test"][1]
        for lag, outcome in test_result.items()
    }
    # Report the actual lag (the statsmodels result key), not a 0-based
    # position in the list of p-values. Ties resolve to the first lag.
    best_lag = min(p_value_by_lag, key=p_value_by_lag.get)

    return {
        "detail": test_result,
        "min_p_value": p_value_by_lag[best_lag],
        "lag_w_min_p_value": best_lag,
    }


def computer_granger_causality_matrix(
    df: pd.DataFrame, xs: List[str], ys: List[str]
) -> pd.DataFrame:
    """Tabulate pairwise Granger causality p-values for columns of a dataframe.

    For every pair (x, y) with x from ``xs`` and y from ``ys``,
    ``check_granger_causality(df[x], df[y])`` is called with its default
    settings and the "min_p_value" entry of its result is stored.

    Args:
        df (pd.DataFrame): Dataframe holding the time series as columns.
        xs (List[str]): Column names used as first argument (rows of the result).
        ys (List[str]): Column names used as second argument (columns of the result).

    Returns:
        pd.DataFrame: One row per name in ``xs`` and one column per name
            in ``ys``, filled with the minimum p-values.
    """
    p_values_by_row = defaultdict(list)

    # Walk the pairs row by row so each row fills up in the order of ``ys``.
    pairs = ((row_name, col_name) for row_name in xs for col_name in ys)
    for row_name, col_name in pairs:
        outcome = check_granger_causality(df[row_name], df[col_name])
        p_values_by_row[row_name].append(outcome["min_p_value"])

    return pd.DataFrame.from_dict(p_values_by_row, orient="index", columns=ys)
# pyprocessta v0.1.0+114.gf131d2d.dirty documentation

# Source code for pyprocessta.eda.statistics