Source code for pyprocessta.eda.statistics

# -*- coding: utf-8 -*-
from collections import defaultdict
from typing import List

import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import adfuller, grangercausalitytests, kpss

__all__ = [
    "check_stationarity",
    "check_granger_causality",
    "computer_granger_causality_matrix",
]


[docs]def check_stationarity( series: pd.Series, threshold: float = 0.05, regression="c" ) -> dict: """Performs the Augmented-Dickey fuller and Kwiatkowski-Phillips-Schmidt-Shin (KPSS) tests for stationarity. Args: series (pd.Series): Time series data threshold (float, optional): p-value thresholds for the statistical tests. Defaults to 0.05. regression (str, optional): If regression="c" then the tests check for stationarity around a constant. For "ct" the test check for stationarity around a trend. Defaults to "c". Returns: dict: Results dictionary with key "stationary" that has a bool as value """ assert regression in ["c", "ct"] adf_results = adfuller(series, regression=regression) kpss_results = kpss(series, regression=regression, nlags="auto") # null hypothesis for ADF is non-sationarity for KPSS null hypothesis is stationarity conclusion = (kpss_results[1] > threshold) & (adf_results[1] < threshold) results = { "adf": { "statistic": adf_results[0], "p_value": adf_results[1], "stationary": adf_results[1] < threshold, }, "kpss": { "statistic": kpss_results[0], "p_value": kpss_results[1], "stationary": kpss_results[1] > threshold, }, "stationary": conclusion, } return results
[docs]def check_granger_causality( x: pd.Series, y: pd.Series, max_lag: int = 20, add_constant: bool = True ) -> dict: """Check if series x is Granger causal for series y We reject the null hypothesis that x does *not* Granger cause y if the pvalues are below a desired size of the test. Args: x (pd.Series): Time series. y (pd.Series): Time series. max_lag (int, optional): Maximum lag to use for the causality checks. Defaults to 20. add_constant (bool, optional): [description]. Defaults to True. Returns: dict: results dictionary """ results = {} test_result = grangercausalitytests( np.hstack([x.values.reshape(-1, 1), y.values.reshape(-1, 1)]), maxlag=max_lag, addconst=add_constant, verbose=False, ) results["detail"] = test_result p_values = [] for _, v in test_result.items(): p_values.append(v[0]["ssr_chi2test"][1]) results["min_p_value"] = min(p_values) results["lag_w_min_p_value"] = np.argmin(p_values) return results
def computer_granger_causality_matrix( df: pd.DataFrame, xs: List[str], ys: List[str] ) -> pd.DataFrame: results_matrix = defaultdict(list) for x in xs: for y in ys: results_matrix[x].append( check_granger_causality(df[x], df[y])["min_p_value"] ) return pd.DataFrame.from_dict(results_matrix, orient="index", columns=ys)