Source code for pyprocessta.causalimpact

# -*- coding: utf-8 -*-
"""
Causal impact analysis uses machine learning to construct a counterfactual
(what would the results have been without an intervention)
which can be used to estimate an effect size without the need for a control group
A good introduction is
https://www.youtube.com/watch?v=GTgZfCltMm8
the original paper is
https://storage.googleapis.com/pub-tools-public-publication-data/pdf/41854.pdf.
The particular implementation we use was described in
https://towardsdatascience.com/implementing-causal-impact-on-top-of-tensorflow-probability-c837ea18b126

One can use covariates to build the counterfactual but one needs to be careful that
they are not changed by the intervention.
"""
from typing import List

import numpy as np
import pandas as pd
import tensorflow_probability as tfp
from causalimpact import CausalImpact
from causalimpact.misc import standardize
from causalimpact.model import build_bijector, build_inv_gamma_sd_prior

from ..eda.statistics import check_granger_causality

__all__ = ["run_causal_impact_analysis"]


def _select_unrelated_x(df, x_columns, intervention_column, p_value_threshold, lag=10):
    unrelated_x = []

    for x_column in x_columns:
        if x_column != intervention_column:
            granger_result = check_granger_causality(
                df[x_column], df[intervention_column], lag
            )
            if granger_result["min_p_value"] > p_value_threshold:
                unrelated_x.append(x_column)

    return unrelated_x


[docs]def run_causal_impact_analysis( df: pd.DataFrame, x_columns: List[str], intervention_column: str, y_column: str, start: List, end: List, p_value_threshold: float = 0.05, ) -> object: """Run the causal impact analysis. Here, we use all the x that are not related to the intervention variable. Args: df (pd.DataFrame): Dataframe to run the analysis on x_columns (List[str]): All column names that can potentially be used as covariates for the counterfactual model intervention_column (str): Name of the column on which the intervention has been performed y_column (str): Target column on which we want to understand the effect of the intervention start (List): Two elements defining the pre-intervention interval end (List): Two elements defining the post-intervention interval p_value_threshold (float): H0 that x does not Granger cause y is rejected when p smaller this threshold. Defaults to 0.05. Returns: object: causalimpact object """ # First, we need to remove all the columns from X that are somehow related to our intervention variable # this avoids that we bias the estimation of our causal effect by data leakage x_columns = _select_unrelated_x( df, x_columns, intervention_column, p_value_threshold ) new_data = df[[y_column] + x_columns] # model = _linear_trend_linear_reg_model(new_data, start, end) # now we can run the causal impact analysis ci = CausalImpact( new_data, start, end, # model=model, model_args={"standardize": True, "nseasons": 16, "prior_sd": 0.01}, ) return ci
def _linear_trend_linear_reg_model(df, start, end): df = df.astype(np.float32) normed_data, _ = standardize(df) obs_data = df.loc[start[0] : start[1]].iloc[:, 0] obs_sd = np.sqrt(obs_data.std()) sd_prior = build_inv_gamma_sd_prior(0.01) sd_prior = build_bijector(sd_prior) # This is an approximation to simulate the bsts package from R. It's expected that # given a few data points the posterior will converge appropriately given this # distribution, that's why it's divided by 2. obs_prior = build_inv_gamma_sd_prior(obs_sd / 2) obs_prior = build_bijector(obs_prior) local = tfp.sts.SemiLocalLinearTrend( observed_time_series=obs_data, level_scale_prior=sd_prior ) linear_reg = tfp.sts.SparseLinearRegression( design_matrix=normed_data.loc[start[0] : end[1]] .iloc[:, 1:] .values.reshape(-1, normed_data.shape[1] - 1) ) model = tfp.sts.Sum( [local, linear_reg], observed_time_series=obs_data, observation_noise_scale_prior=obs_prior, ) return model