Load packages#
try:
import davos
except ModuleNotFoundError:
%pip install davos
import davos
Collecting davos
Obtaining dependency information for davos from https://files.pythonhosted.org/packages/98/dd/2c948b689686d3c3f00947d80b49d3731ae0b25f66ef826910ae203312bf/davos-0.2.3-py3-none-any.whl.metadata
Downloading davos-0.2.3-py3-none-any.whl.metadata (49 kB)
Requirement already satisfied: packaging in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from davos) (23.1)
Downloading davos-0.2.3-py3-none-any.whl (102 kB)
Installing collected packages: davos
Successfully installed davos-0.2.3
Note: you may need to restart the kernel to use updated packages.
from collections.abc smuggle Iterable
smuggle numpy as np # pip: numpy==1.26.0
smuggle pandas as pd # pip: pandas==2.1.1
from scipy smuggle stats # pip: scipy==1.11.2
Collecting numpy==1.26.0
Obtaining dependency information for numpy==1.26.0 from https://files.pythonhosted.org/packages/35/21/9e150d654da358beb29fe216f339dc17f2b2ac13fff2a89669401a910550/numpy-1.26.0-cp311-cp311-macosx_11_0_arm64.whl.metadata
Downloading numpy-1.26.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (99 kB)
  Downloading numpy-1.26.0-cp311-cp311-macosx_11_0_arm64.whl (14.0 MB)
Installing collected packages: numpy
Successfully installed numpy-1.26.0
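The smuggle statements above are davos's drop-in replacement for import: each name is imported as usual, and if the package is missing (or doesn't match the version pinned in the trailing "onion" comment) it is first installed with pip. If you'd rather not use davos, a rough equivalent using the same pins would be:

%pip install numpy==1.26.0 pandas==2.1.1 scipy==1.11.2
from collections.abc import Iterable
import numpy as np
import pandas as pd
from scipy import stats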
Implement a GLM from scratch#
def requires_fit(method):
"""
Decorator used in GLM class to disallow running certain methods
before fitting the model.
"""
def check_fitted_wrapper(self, *args, **kwargs):
if not self.fitted:
raise ValueError(
f"Model must be fit before running self.{method.__name__}. "
"Fit the model first with self.fit()"
)
return method(self, *args, **kwargs)
return check_fitted_wrapper
class GLM:
def __init__(self, X, y, ensure_intercept=True, predictor_labels=None):
"""
Initialize a GLM instance to fit, inspect, and run tests.
Accepts a design matrix X and response vector y as lists/numpy
arrays or labeled pandas DataFrames/Series.
Parameters
----------
X : array-like
Design matrix with shape (n, p). `n` is the number of
observations and `p` is the number of predictors, optionally
including an intercept (see `ensure_intercept` below).
If X is a pandas DataFrame, column names will be used as
labels for the predictor variables.
y : array-like
Response vector with shape (n,).
ensure_intercept : bool, optional
If True (default), ensure that the first column of X
contains all 1's for the intercept, inserting one if
necessary. If False, do not include an intercept.
predictor_labels : list of str, optional
List of labels for the predictors in X. If None (default),
use column names from X if X is a pandas DataFrame,
otherwise use generic labels "X_0", "X_1", etc.
"""
if predictor_labels is None:
if isinstance(X, pd.DataFrame):
self.predictor_labels = X.columns.tolist()
elif isinstance(X, pd.Series):
                # a Series is treated as a single (named) predictor
                self.predictor_labels = [X.name]
else:
self.predictor_labels = [f"X_{i}" for i in range(X.shape[1])]
elif not isinstance(predictor_labels, Iterable):
raise TypeError("'predictor_labels' must be an iterable if passed")
elif len(predictor_labels) != X.shape[1]:
raise ValueError(
f"'predictor_labels' must contain {X.shape[1]} labels"
)
else:
            # copy so inserting the intercept label doesn't mutate the caller's list
            self.predictor_labels = list(predictor_labels)
        X = np.asarray(X)
        # support a single predictor passed as a 1D array/Series
        if X.ndim == 1:
            X = X.reshape(-1, 1)
y = np.asarray(y).squeeze()
if X.shape[0] != y.shape[0]:
raise ValueError(
"X and y must have the same number of rows (observations)"
)
if y.ndim != 1:
raise ValueError("y must be a n x 1 vector of observed responses")
if ensure_intercept and not (X[:, 0] == 1).all():
self.X_ = np.hstack((np.ones((X.shape[0], 1)), X))
self.predictor_labels.insert(0, "Intercept")
else:
self.X_ = X
self.y_ = y
self.n_obs = X.shape[0]
self.n_predictors = self.X_.shape[1] # p includes intercept
self.df_model = self.n_predictors - 1
self.df_error = self.n_obs - self.n_predictors
self.fitted = False
def fit(self):
"""
        Fit the model to the data provided at initialization. Updates
        the instance in place and also returns it (a la scikit-learn
        estimators).
Returns
-------
GLM
The fitted GLM instance
"""
gram_inv = np.linalg.inv(self.X_.T @ self.X_)
# beta-hat, shape: (self.n_predictors,)
self.betas = gram_inv @ self.X_.T @ self.y_
# y-hat, shape: (self.n_obs,)
# also alias y_hat as y_pred (standard scikit-learn convention)
self.y_hat = self.y_pred = self.X_ @ self.betas
# residuals, shape: (self.n_obs,)
self.residuals = self.y_ - self.y_hat
self.ss_res = (self.residuals ** 2).sum()
self.ss_tot = ((self.y_ - self.y_.mean()) ** 2).sum()
self.R2 = 1 - self.ss_res / self.ss_tot
self.AIC = self.n_obs * np.log(self.ss_res / self.n_obs) + 2 * self.n_predictors
# variance/covariance matrix for betas,
# shape: (self.n_predictors, self.n_predictors)
self.beta_cov = self.ss_res / self.df_error * gram_inv
# standard errors for betas, shape: (self.n_predictors,)
self.beta_ses = np.diag(self.beta_cov) ** 0.5
# t-statistics for betas, shape: (self.n_predictors,)
self.beta_ts = self.betas / self.beta_ses
# (2-tailed) p-values for t-tests for betas,
# shape: (self.n_predictors,)
self.beta_ps = 2 * stats.t.sf(np.abs(self.beta_ts), self.df_error)
# set fitted flag to True to enable other methods that require
# these computations first
self.fitted = True
return self
@requires_fit
def predict(self, X):
"""
Predict the value of the response variable for new data.
Parameters
----------
X : array-like
            Array with the same number of columns (predictors,
            including the intercept column if one was added) as the
            design matrix used to fit the model. May contain any
            number of observations (rows).
Returns
-------
y_pred : 1D array
Predicted values of the response variable for each row of X.
"""
X = np.atleast_2d(np.asarray(X))
if X.shape[1] != self.n_predictors:
raise ValueError(
f"X must have {self.n_predictors} columns (predictors)"
)
return X @ self.betas
@requires_fit
def summary(self):
"""Print a formatted summary table of model results."""
print("General Linear Model")
print("===================")
print(f"Number of observations: {self.n_obs}")
print(f"Number of predictors: {self.n_predictors}")
print(f"Model degrees of freedom: {self.df_model}")
print(f"Error degrees of freedom: {self.df_error}")
print(f"Residual standard error: {self.ss_res:.3f}")
print(f"R^2: {self.R2:.3f}")
print(f"AIC: {self.AIC:.3f}")
print("\nCoefficients:")
print("-------------")
# Format the table width to fit the longest predictor name
# (min width: 4 cols)
name_col_width = max(len(max(self.predictor_labels, key=len)), 4)
header = "\033[1m{:<{width}}{:>8}{:>8}{:>8}{:>8}\033[0m".format(
"Name", "B̂", "SE", "t", "p", width=name_col_width
)
print(header)
for i in range(self.n_predictors):
label = self.predictor_labels[i]
beta = self.betas[i]
se = self.beta_ses[i]
t = self.beta_ts[i]
p = self.beta_ps[i]
if p < 0.001:
sig = '***'
elif p < 0.01:
sig = '**'
elif p < 0.05:
sig = '*'
else:
sig = ''
print("{:<{width}}{:>8.3f}{:>8.3f}{:>8.3f}{:>8.3f} \033[1m{sig}\033[0m".format(
label, beta, se, t, p, width=name_col_width, sig=sig)
)
@requires_fit
def t_test(self, predictor):
"""
        Two-tailed t-test of the null hypothesis that the true
        coefficient for a given predictor is equal to zero.
Parameters
----------
predictor : int or str
Index or label of predictor for which to run the t-test.
Note that the 0th predictor refers to the intercept.
Returns
-------
t, p
t-statistic and associated p-value as a tuple.
"""
if isinstance(predictor, str):
predictor = self.predictor_labels.index(predictor)
elif not isinstance(predictor, int):
raise TypeError("'predictor' must be an int or str")
return self.beta_ts[predictor], self.beta_ps[predictor]
@requires_fit
def test_contrast(self, contrast):
"""
        Two-tailed t-test of the null hypothesis that the given
        contrast of coefficients is equal to zero.
Parameters
----------
contrast : array-like
Contrast vector of shape (p,). p is the number of predictors
(including the intercept).
Returns
-------
t, p
t-statistic and associated p-value as a tuple.
"""
contrast = np.asarray(contrast).squeeze()
if contrast.ndim != 1:
raise ValueError("'contrast' must be a 1D array")
if contrast.shape[0] != self.n_predictors:
raise ValueError(
f"'contrast' must have {self.n_predictors} elements"
)
# compute t-statistic
# note: contrast matrices not transposed here because numpy
# automatically does so for 1D vectors
t_stat = (
(contrast @ self.betas) /
np.sqrt(contrast @ self.beta_cov @ contrast)
)
# compute (2-tailed) p-value
p_value = 2 * stats.t.sf(np.abs(t_stat), self.df_error)
return t_stat, p_value
@requires_fit
def f_test(self, predictors=None):
"""
F-test for the null hypothesis that the given (set of)
predictor(s) has no effect on the response variable (i.e., that
the quality of the model fit would not significantly worsen if
the given predictor(s) were removed).
Parameters
----------
predictors : int, str, or iterable of int or str, optional
index/indices or label(s) of predictor(s) for which to run
the F-test. Note that the 0th predictor refers to the
            intercept. If None (default), all non-intercept predictors
            are tested (i.e., the full model is compared against an
            intercept-only model).
Returns
-------
F, p: tuple of float
F-statistic and associated p-value as a tuple.
"""
if predictors is None:
            predictors = list(range(1, self.n_predictors))
elif isinstance(predictors, int):
predictors = [predictors]
elif isinstance(predictors, str):
predictors = [self.predictor_labels.index(predictors)]
elif isinstance(predictors, Iterable):
_predictors = []
for p in predictors:
if isinstance(p, str):
_predictors.append(self.predictor_labels.index(p))
elif isinstance(p, int):
_predictors.append(p)
else:
raise TypeError(
"'predictors' must be an int, str, or iterable of int "
"or str"
)
predictors = _predictors
if not all(p in range(self.n_predictors) for p in predictors):
raise ValueError("'predictors' must be a valid predictor index or "
"label"
)
if len(predictors) == 0:
raise ValueError("'predictors' must contain at least one index or "
"label"
)
# get sum of squared residuals for reduced model
X_reduced = np.delete(self.X_, predictors, axis=1)
betas_reduced = np.linalg.inv(X_reduced.T @ X_reduced) @ X_reduced.T @ self.y_
residuals_reduced = self.y_ - X_reduced @ betas_reduced
ssr_reduced = (residuals_reduced ** 2).sum()
        # compute F-statistic
        # numerator df = number of predictors removed (q);
        # denominator df = residual df of the full model (n - p)
        ssr_full = self.ss_res
        p = self.n_predictors
        q = len(predictors)
        n = self.n_obs
        f_stat = ((ssr_reduced - ssr_full) / q) / (ssr_full / (n - p))
        # compute p-value
        p_value = stats.f.sf(f_stat, q, n - p)
return f_stat, p_value
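Before pointing the class at real data, a quick sanity check against an existing implementation is reassuring. The snippet below is not part of the analysis (the simulated data and variable names are illustrative); it fits a single-predictor model and confirms the estimates and p-value agree with scipy.stats.linregress:

# simulate y = 2 + 3x + noise, then fit with both implementations
rng = np.random.default_rng(0)
x_sim = rng.normal(size=100)
y_sim = 2 + 3 * x_sim + rng.normal(size=100)

glm_check = GLM(x_sim.reshape(-1, 1), y_sim).fit()
lin = stats.linregress(x_sim, y_sim)

# intercept, slope, and the slope's two-tailed p-value should all agree
assert np.isclose(glm_check.betas[0], lin.intercept)
assert np.isclose(glm_check.betas[1], lin.slope)
assert np.isclose(glm_check.t_test(1)[1], lin.pvalue)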
Load in (and clean) some data to test it out!#
# load dataset from GitHub
dataset_url = (
"https://github.com/torwager/ComputationalFoundations/raw/"
"79b92a3/CompFound/datasets/Ashar2022_outcomes_demographics.csv"
)
df_raw = pd.read_csv(dataset_url, na_values={'current_opioid_use': '<undefined>'})
# drop individuals that don't have data for both sessions
dupes_only = df_raw.loc[df_raw.duplicated('id', keep=False)]
# make sure each individual's data actually comes from 2 different
# sessions (i.e., no duplicates or mislabels)
for i in dupes_only['id'].unique():
assert set(dupes_only.query('id == @i')['time']) == {1, 2}
# reshape to have one row per participant with data from both sessions
# pivot on columns with session-agnostic values (age, gender, etc.)
same_val_cols = (
['id', 'group'] +
list(dupes_only.columns[list(dupes_only.columns).index('education'):])
)
df = dupes_only.pivot(index=same_val_cols, columns='time')
df.columns = df.columns.map(
lambda x: f'{x[0]}_{"before" if x[1] == 1 else "after"}'
)
df = df.reset_index()
# deal with columns for data that wasn't collected in both sessions
all_nan_cols = df.columns[df.isnull().sum(axis=0) == len(df)]
df = df.drop(columns=all_nan_cols)
for colname in all_nan_cols:
base_colname, _, suffix = colname.rpartition('_')
other_suffix = 'before' if suffix == 'after' else 'after'
df = df.rename(columns={f'{base_colname}_{other_suffix}': base_colname})
# add column for change in pain level
# (this will be the response variable for the model)
df['bpi_intensity_change'] = df['bpi_intensity_before'] - df['bpi_intensity_after']
# display dataset
df
|   | id | group | education | ethnicity | hispanic | employment_status | exercise | handedness | sses | married_or_living_as_marri | ... | sopa_emo_after | pgic | tx_satisfaction | alcohol_before | alcohol_after | opioid_before | opioid_after | cannabis_before | cannabis_after | bpi_intensity_change |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 12 | 3.0 | 2.0 | 4.0 | 0.0 | 1.0 | 3.0 | 1.0 | 4.0 | 0.0 | ... | 8.0 | 1.0 | 52.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.000 | 0.000 | -0.25 |
1 | 14 | 3.0 | 3.0 | 4.0 | 0.0 | 3.0 | 4.0 | 1.0 | 9.0 | 0.0 | ... | 6.0 | 1.0 | 30.5 | 23.0 | 21.0 | 0.0 | 0.0 | 3.500 | 3.500 | 0.50 |
2 | 15 | 3.0 | 2.0 | 5.0 | 0.0 | 1.0 | 3.0 | 1.0 | 6.0 | 1.0 | ... | 4.0 | 1.0 | 50.5 | 15.0 | 29.0 | 0.0 | 0.0 | 0.000 | 0.000 | -0.50 |
3 | 18 | 2.0 | 3.0 | 4.0 | 0.0 | 1.0 | 3.0 | 1.0 | 5.0 | 1.0 | ... | 9.0 | 5.0 | 50.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.000 | 0.000 | 2.25 |
4 | 23 | 2.0 | 3.0 | 2.0 | 0.0 | 1.0 | 5.0 | 1.0 | 6.0 | 0.0 | ... | 8.0 | 1.0 | 17.5 | 29.0 | 13.5 | 0.0 | 0.0 | 0.015 | 0.025 | 0.25 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
130 | 1268 | 2.0 | 3.0 | 4.0 | 0.0 | 2.0 | 4.0 | 1.0 | 8.0 | 1.0 | ... | 8.0 | 5.0 | 88.0 | 2.0 | 1.0 | 0.0 | 0.0 | 0.000 | 0.000 | 1.75 |
131 | 1277 | 2.0 | 3.0 | 4.0 | 0.0 | 1.0 | 3.0 | 1.0 | 8.0 | 1.0 | ... | 10.0 | 5.0 | 46.0 | 23.0 | 30.0 | 0.0 | 0.0 | 0.000 | 0.000 | -0.75 |
132 | 1294 | 2.0 | 3.0 | 4.0 | 0.0 | 1.0 | 3.0 | 1.0 | 6.0 | 0.0 | ... | 8.0 | 2.0 | 54.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000 | 0.000 | 0.00 |
133 | 1302 | 3.0 | 3.0 | 4.0 | 0.0 | 1.0 | 2.0 | 1.0 | 8.0 | 0.0 | ... | 6.0 | 2.0 | 64.0 | 3.0 | 3.0 | 0.0 | 0.0 | 0.000 | 0.000 | 0.00 |
134 | 1319 | 2.0 | 3.0 | 5.0 | 0.0 | 1.0 | 3.0 | 1.0 | 5.0 | 0.0 | ... | 5.0 | 5.0 | 69.0 | 11.0 | 3.0 | 2.0 | 1.0 | 7.000 | 19.000 | 0.75 |
135 rows × 51 columns
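The pivot/rename steps above are easy to get subtly wrong, so a couple of quick checks on the reshaped table are worthwhile. This is an optional sketch using the column names created above:

# each participant should contribute exactly one row after pivoting
assert df['id'].is_unique
# the response measure should have a before/after column pair
assert {'bpi_intensity_before', 'bpi_intensity_after'}.issubset(df.columns)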
select a subset of variables to use as predictors#
predictors = [
'group', 'education', 'ethnicity', 'hispanic', 'employment_status',
'exercise', 'handedness', 'sses', 'married_or_living_as_marri', 'age',
'weight', 'gender', 'backpain_length'
]
X = df[predictors]
y = df['bpi_intensity_change']
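The least-squares computation in GLM.fit can't handle missing values (a NaN anywhere in X or y would propagate through the matrix algebra), so it's worth confirming the selected columns are complete before fitting. An optional check:

# count missing values in the predictors and the response
print(X.isna().sum().sum(), "missing values in X")
print(y.isna().sum(), "missing values in y")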
fit the model and display results#
glm = GLM(X, y).fit()
glm.summary()
General Linear Model
===================
Number of observations: 135
Number of predictors: 14
Model degrees of freedom: 13
Error degrees of freedom: 121
Residual sum of squares: 193.674
R^2: 0.399
AIC: 76.722
Coefficients:
-------------
Name B̂ SE t p
Intercept 5.284 1.641 3.220 0.002 **
group -1.054 0.137 -7.670 0.000 ***
education -0.543 0.278 -1.955 0.053
ethnicity 0.058 0.219 0.264 0.792
hispanic -0.373 0.716 -0.522 0.603
employment_status -0.298 0.147 -2.019 0.046 *
exercise -0.121 0.124 -0.975 0.331
handedness 0.462 0.240 1.924 0.057
sses -0.047 0.068 -0.681 0.497
married_or_living_as_marri 0.119 0.254 0.466 0.642
age 0.009 0.009 1.033 0.304
weight -0.003 0.004 -0.881 0.380
gender 0.253 0.258 0.981 0.329
backpain_length -0.018 0.014 -1.270 0.206
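If you want an external check on these numbers, the same model can be fit with statsmodels (not used elsewhere in this notebook, so treat this as an optional sketch; smuggle will install it if needed, or use a plain import if statsmodels is already available):

smuggle statsmodels.api as sm  # pip: statsmodels
ols_results = sm.OLS(glm.y_, glm.X_).fit()
# coefficients, standard errors, t-statistics, p-values, and R^2
# should match the table above
print(ols_results.summary())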
run some individual t-tests#
print('education')
t, p = glm.t_test('education')
print(f"\t{t = }, {p = }")
print('employment_status')
t, p = glm.t_test('employment_status')
print(f"\t{t = }, {p = }")
education
t = -1.9546532038088111, p = 0.05293030529967579
employment_status
t = -2.0186211253690964, p = 0.04573738199608248
run some F-tests on a few subsets of predictors#
print('intercept-only model')
F, p = glm.f_test()
print(f"\t{F = }, {p = }\n")
print('["group", "employment_status", "age"]')
F, p = glm.f_test(["employment_status", "age"])
print(f"\t{F = }, {p = }")
intercept-only model
F = 80.22027070929019, p = 4.903937633539385e-15
["group", "employment_status", "age"]
F = 0.36459164573984326, p = 0.9733056577039912
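For a single predictor, the partial F-test is equivalent to the two-tailed t-test reported above (F = t², with identical p-values), which provides a quick internal consistency check:

# the F-test of one predictor should reproduce its t-test
F, p_f = glm.f_test('education')
t, p_t = glm.t_test('education')
assert np.isclose(F, t ** 2) and np.isclose(p_f, p_t)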
test some contrasts#
group_vs_education = np.zeros(glm.n_predictors, dtype=int)
group_vs_education[[1, 2]] = [1, -1]
print(f"{group_vs_education = }")
t, p = glm.test_contrast(group_vs_education)
print(f"\t{t = }, {p = }\n")
ethnicity_vs_employment_status = np.zeros(glm.n_predictors, dtype=int)
ethnicity_vs_employment_status[[3, 5]] = [1, -1]
print(f"{ethnicity_vs_employment_status = }")
t, p = glm.test_contrast(ethnicity_vs_employment_status)
print(f"\t{t = }, {p = }")
group_vs_education = array([ 0, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
t = -1.7110376121312285, p = 0.0896365856306869
ethnicity_vs_employment_status = array([ 0, 0, 0, 1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0])
t = 1.382285772613835, p = 0.16943021507691455
predict response variable for some (fake) new data#
np.random.seed(0)
new_data = np.random.rand(3, glm.X_.shape[1])
glm.predict(new_data)
array([1.99317962, 0.11215594, 2.29967943])
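Note that predict expects new observations laid out like the fitted design matrix, so when the model includes an intercept the first column should be all 1's. For example, with (hypothetical) random values for the 13 real predictors:

# two new observations: a leading 1 for the intercept, then 13 predictor values
new_obs = np.ones((2, glm.n_predictors))
new_obs[:, 1:] = np.random.rand(2, glm.n_predictors - 1)
glm.predict(new_obs)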