diff options
author | northern-64bit <75195383+northern-64bit@users.noreply.github.com> | 2023-05-25 16:54:11 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-05-25 14:54:11 +0000 |
commit | 8e21b340e0b657e359de0e63a1a8032153a6c89b (patch) | |
tree | bb74eca55a3b5413a6caeab9b0258be76a4c519a | |
parent | e30da799aa5aa0900cbffc1ce455c28fd55cc2ca (diff) |
Adds test for collinearity to the `econometrics` menu (#5018)
* Add `vif` command to the econometrics menu
Adds `vif` command, which tests collinearity by calculating the variance inflation factor.
* Fix spelling
* Fix ruff
* Fix ruff
* Fix mypy + add sdk example
* Fix mypy
* Fix pylint
* Fix input + error handling
* Remove rogue print + error handling for one column
* fix linting
8 files changed, 194 insertions, 13 deletions
diff --git a/openbb_terminal/core/sdk/models/econometrics_sdk_model.py b/openbb_terminal/core/sdk/models/econometrics_sdk_model.py index 690dd7ca2d3..018d5de9fcf 100644 --- a/openbb_terminal/core/sdk/models/econometrics_sdk_model.py +++ b/openbb_terminal/core/sdk/models/econometrics_sdk_model.py @@ -39,6 +39,7 @@ class EconometricsRoot(Category): `re`: The random effects model is virtually identical to the pooled OLS model except that is accounts for the\n `root`: Calculate test statistics for unit roots\n `root_chart`: Determine the normality of a timeseries.\n + `vif`: Determine the vif, which tests for collinearity.\n """ _location_path = "econometrics" @@ -75,3 +76,4 @@ class EconometricsRoot(Category): self.re = lib.econometrics_regression_model.get_re self.root = lib.econometrics_model.get_root self.root_chart = lib.econometrics_view.display_root + self.vif = lib.econometrics_model.get_vif diff --git a/openbb_terminal/core/sdk/trail_map.csv b/openbb_terminal/core/sdk/trail_map.csv index eb85b6579d6..e339b496bbf 100644 --- a/openbb_terminal/core/sdk/trail_map.csv +++ b/openbb_terminal/core/sdk/trail_map.csv @@ -183,6 +183,7 @@ econometrics.panel,econometrics_regression_model.get_regressions_results,econome econometrics.pols,econometrics_regression_model.get_pols, econometrics.re,econometrics_regression_model.get_re, econometrics.root,econometrics_model.get_root,econometrics_view.display_root +econometrics.vif,econometrics_model.get_vif, econometrics.garch,econometrics_model.get_garch,econometrics_view.display_garch economy.available_indices,economy_yfinance_model.get_available_indices, economy.balance,economy_oecd_model.get_balance,economy_oecd_view.plot_balance diff --git a/openbb_terminal/econometrics/econometrics_controller.py b/openbb_terminal/econometrics/econometrics_controller.py index 8602eeabec2..03b1a9a538d 100644 --- a/openbb_terminal/econometrics/econometrics_controller.py +++ b/openbb_terminal/econometrics/econometrics_controller.py @@ -77,6 +77,7 
@@ class EconometricsController(BaseController): "garch", "granger", "coint", + "vif", ] CHOICES_MENUS: List[str] = [ "qa", @@ -189,6 +190,7 @@ class EconometricsController(BaseController): "corr", "season", "lag", + "vif", ]: choices[feature] = dict() @@ -235,12 +237,14 @@ class EconometricsController(BaseController): ]: self.choices[feature] = {c: {} for c in self.files} - self.choices["type"] = { - c: {} for c in self.files + list(dataset_columns.keys()) - } - self.choices["desc"] = { - c: {} for c in self.files + list(dataset_columns.keys()) - } + for feature in ["type", "desc", "vif"]: + self.choices[feature] = { + c: {} for c in self.files + list(dataset_columns.keys()) + } + self.choices["vif"] = dict( + self.choices["vif"], + **{"-d": self.choices["vif"], "--data": self.choices["vif"]}, + ) pairs_timeseries = list() for dataset_col in list(dataset_columns.keys()): @@ -286,20 +290,21 @@ class EconometricsController(BaseController): mt.add_cmd("lag", self.files) mt.add_cmd("ret", self.files) mt.add_cmd("export", self.files) - mt.add_info("time_series_") + mt.add_info("_assumption_testing_") mt.add_cmd("norm", self.files) - mt.add_cmd("ols", self.files) mt.add_cmd("granger", self.files) mt.add_cmd("root", self.files) mt.add_cmd("coint", self.files) + mt.add_cmd("vif", self.files) + mt.add_cmd("dwat", self.files and self.regression["OLS"]["model"]) + mt.add_cmd("bgod", self.files and self.regression["OLS"]["model"]) + mt.add_cmd("bpag", self.files and self.regression["OLS"]["model"]) + mt.add_info("_time_series_") + mt.add_cmd("ols", self.files) mt.add_cmd("garch", self.files) mt.add_info("_panel_") mt.add_cmd("panel", self.files) mt.add_cmd("compare", self.files) - mt.add_info("_residuals_") - mt.add_cmd("dwat", self.files and self.regression["OLS"]["model"]) - mt.add_cmd("bgod", self.files and self.regression["OLS"]["model"]) - mt.add_cmd("bpag", self.files and self.regression["OLS"]["model"]) console.print(text=mt.menu_text, menu="Econometrics") 
console.print() @@ -1884,7 +1889,7 @@ class EconometricsController(BaseController): formatter_class=argparse.ArgumentDefaultsHelpFormatter, prog="bgod", description=( - "Show Breusch-Godfrey autocorrelation test results." + "Show Breusch-Godfrey autocorrelation test results. " "Needs OLS to be run in advance with independent and dependent variables" ), ) @@ -2182,3 +2187,78 @@ class EconometricsController(BaseController): console.print( "[red]More than one dataset.column must be provided.\n[/red]" ) + + @log_start_end(log=logger) + def call_vif(self, other_args: List[str]): + """Process vif command""" + parser = argparse.ArgumentParser( + add_help=False, + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + prog="vif", + description=r"""Calculates VIF (variance inflation factor), which tests collinearity. + + It quantifies the severity of multicollinearity in an ordinary least squares regression analysis. The square + root of the variance inflation factor indicates how much larger the standard error increases compared to if + that variable had 0 correlation to other predictor variables in the model. + + It is defined as: + + $ VIF_i = 1 / (1 - R_i^2) $ + where $ R_i $ is the coefficient of determination of the regression equation with the column i being the + result from the i:th series being the exogenous variable. + + A VIF over 5 indicates a high collinearity and correlation. Values over 10 indicates causes problems, + while a value of 1 indicates no correlation. Thus VIF values between 1 and 5 are most commonly considered + acceptable. In order to improve the results one can often remove a column with high VIF. 
+ + For further information see: https://en.wikipedia.org/wiki/Variance_inflation_factor""", + ) + parser.add_argument( + "-d", + "--data", + help="The datasets and columns we want to add <dataset>,<dataset2.column>,<dataset2.column2>", + dest="data", + type=check_list_values(self.choices["vif"]), + default=None, + ) + if other_args and "-" not in other_args[0][0]: + other_args.insert(0, "-d") + ns_parser = self.parse_known_args_and_warn( + parser, other_args, EXPORT_ONLY_RAW_DATA_ALLOWED + ) + + data = pd.DataFrame() + if ns_parser: + if ns_parser.data is None: + console.print("[red]Please enter a dataset to calculate vif for.[/red]") + return + if len(ns_parser.data) == 1 and "." in ns_parser.data[0]: + console.print( + "[red]Please enter at least a dataset or two columns to calculate vif for." + "vif can only be calculated for at least two columns.[/red]" + ) + for option in ns_parser.data: + if "." in option: + dataset, column = option.split(".") + else: + dataset = option + column = None + + if dataset not in self.datasets: + console.print( + f"[red]Not able to find the dataset {dataset}. Please choose one of " + f"the following: {', '.join(self.datasets)}[/red]" + ) + elif column is not None: + if column not in self.datasets[dataset]: + console.print( + f"[red]Not able to find the column {column}. 
Please choose one of " + f"the following: {', '.join(self.datasets[dataset].data)}[/red]" + ) + else: + data[f"{dataset}_{column}"] = self.datasets[dataset][column] + else: + for column in list(self.datasets[dataset].columns): + data[f"{dataset}_{column}"] = self.datasets[dataset][column] + + econometrics_view.display_vif(data) diff --git a/openbb_terminal/econometrics/econometrics_model.py b/openbb_terminal/econometrics/econometrics_model.py index fdcabb4c68e..b1c6dde473a 100644 --- a/openbb_terminal/econometrics/econometrics_model.py +++ b/openbb_terminal/econometrics/econometrics_model.py @@ -13,6 +13,8 @@ import pandas as pd import statsmodels.api as sm from arch import arch_model from scipy import stats +from statsmodels.stats.outliers_influence import variance_inflation_factor +from statsmodels.tools.tools import add_constant from statsmodels.tsa.stattools import adfuller, grangercausalitytests, kpss from openbb_terminal.rich_config import console @@ -482,3 +484,51 @@ def get_engle_granger_two_step_cointegration_test( adfstat, pvalue, _, _, _ = adfuller(z, maxlag=1, autolag=None) return c, gamma, alpha, z, adfstat, pvalue + + +def get_vif(dataset: pd.DataFrame, columns: Optional[list] = None) -> pd.DataFrame: + r"""Calculates VIF (variance inflation factor), which tests collinearity. + + It quantifies the severity of multicollinearity in an ordinary least squares regression analysis. The square + root of the variance inflation factor indicates how much larger the standard error increases compared to if + that variable had 0 correlation to other predictor variables in the model. + + It is defined as: + + $ VIF_i = 1 / (1 - R_i^2) $ + where $ R_i $ is the coefficient of determination of the regression equation with the column i being the result + from the i:th series being the exogenous variable. + + A VIF over 5 indicates a high collinearity and correlation. Values over 10 indicates causes problems, while a + value of 1 indicates no correlation. 
Thus VIF values between 1 and 5 are most commonly considered acceptable. + In order to improve the results one can often remove a column with high VIF. + + For further information see: https://en.wikipedia.org/wiki/Variance_inflation_factor + + Parameters + ---------- + dataset: pd.Series + Dataset to calculate VIF on + columns: Optional[list] + The columns to calculate to test for collinearity + + Returns + ------- + pd.DataFrame + Dataframe with the resulting VIF values for the selected columns + Examples + -------- + >>> from openbb_terminal.sdk import openbb + >>> longley = openbb.econometrics.load("longley") + >>> openbb.econometrics.vif(longley, ["TOTEMP","UNEMP","ARMED"]) + """ + df = add_constant(dataset if columns is None else dataset[columns]) + vif = pd.DataFrame( + { + "VIF Values": [ + variance_inflation_factor(df.values, i) for i in range(df.shape[1]) + ][1:] + }, + index=df.columns[1:], + ) + return vif diff --git a/openbb_terminal/econometrics/econometrics_view.py b/openbb_terminal/econometrics/econometrics_view.py index c13342c8944..6bebc81065a 100644 --- a/openbb_terminal/econometrics/econometrics_view.py +++ b/openbb_terminal/econometrics/econometrics_view.py @@ -621,3 +621,47 @@ def display_cointegration_test( return fig.show(external=external_axes) return None + + +@log_start_end(log=logger) +def display_vif( + dataset: pd.DataFrame, + columns: Optional[list] = None, + export: str = "", + sheet_name: Optional[str] = None, +): + """Displays the VIF (variance inflation factor), which tests for collinearity, values for each column. + + Parameters + ---------- + dataset: pd.Series + Dataset to calculate VIF on + columns: Optional[list] + The columns to calculate to test for collinearity + sheet_name: Optional[str] + Optionally specify the name of the sheet the data is exported to. + export: str + Format to export data. 
+ """ + columns = dataset.columns if columns is None else columns + if any(dataset[column].dtype not in [int, float] for column in columns): + console.print( + "All column types must be numeric. Consider using the command 'type' to change this.\n" + ) + else: + results = econometrics_model.get_vif(dataset, columns) + + print_rich_table( + results, + headers=list(results.columns), + show_index=True, + title="Collinearity Test", + ) + + export_data( + export, + os.path.dirname(os.path.abspath(__file__)), + f"{dataset}_{','.join(columns)}_vif", + results, + sheet_name, + ) diff --git a/openbb_terminal/miscellaneous/i18n/en.yml b/openbb_terminal/miscellaneous/i18n/en.yml index f06570a2d4f..71633e8f9ce 100644 --- a/openbb_terminal/miscellaneous/i18n/en.yml +++ b/openbb_terminal/miscellaneous/i18n/en.yml @@ -981,6 +981,8 @@ en: econometrics/coint: co-integration test on a multitude of columns econometrics/garch: estimate future volatility with GARCH econometrics/ret: calculate returns for the given time series + econometrics/vif: perform collinearity test (VIF) + econometrics/_assumption_testing_: Assumption Testing portfolio/bro: brokers holdings supports robinhood, ally, degiro, coinbase portfolio/po: portfolio optimization optimize your portfolio weights efficiently portfolio/load: load transactions into the portfolio (use load --example for an example) diff --git a/openbb_terminal/miscellaneous/integration_tests_scripts/econometrics/test_econometrics.openbb b/openbb_terminal/miscellaneous/integration_tests_scripts/econometrics/test_econometrics.openbb index 2e0ae729b31..6f9bbd6d662 100644 --- a/openbb_terminal/miscellaneous/integration_tests_scripts/econometrics/test_econometrics.openbb +++ b/openbb_terminal/miscellaneous/integration_tests_scripts/econometrics/test_econometrics.openbb @@ -51,6 +51,7 @@ panel -d wp.lwage -i wp.black,wp.hisp,wp.exper,wp.married,wp.educ,wp.union -r bo load longley -a ll ols -d ll.totemp -i 
ll.gnpdefl,ll.gnp,ll.unemp,ll.armed,ll.pop,ll.year +vif ll.totemp,ll.unemp,ll.armed ## Regression Tests diff --git a/openbb_terminal/sdk.py b/openbb_terminal/sdk.py index d41133a5b38..74bb632fa43 100644 --- a/openbb_terminal/sdk.py +++ b/openbb_terminal/sdk.py @@ -140,6 +140,7 @@ class OpenBBSDK: `re`: The random effects model is virtually identical to the pooled OLS model except that is accounts for the\n `root`: Calculate test statistics for unit roots\n `root_chart`: Determine the normality of a timeseries.\n + `vif`: Determine the vif, which tests for collinearity.\n """ return model.EconometricsRoot() |