Load packages

Load packages#

try:
    import davos
except ModuleNotFoundError:
    %pip install davos
    import davos

Collecting davos
  Obtaining dependency information for davos from https://files.pythonhosted.org/packages/98/dd/2c948b689686d3c3f00947d80b49d3731ae0b25f66ef826910ae203312bf/davos-0.2.3-py3-none-any.whl.metadata

  Downloading davos-0.2.3-py3-none-any.whl.metadata (49 kB)

?25l     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0.0/49.5 kB ? eta -:--:--
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 49.5/49.5 kB 1.1 MB/s eta 0:00:00
?25h

Requirement already satisfied: packaging in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from davos) (23.1)
Downloading davos-0.2.3-py3-none-any.whl (102 kB)
?25l   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0.0/102.2 kB ? eta -:--:--

   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╺━━━ 92.2/102.2 kB 9.5 MB/s eta 0:00:01
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 102.2/102.2 kB 2.3 MB/s eta 0:00:00
?25h

Installing collected packages: davos
Successfully installed davos-0.2.3

[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: pip install --upgrade pip

Note: you may need to restart the kernel to use updated packages.

/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/davos/core/project.py:896: UserWarning: Failed to identify notebook path. Falling back to generic default project
  warnings.warn(

from collections.abc smuggle Iterable

smuggle numpy as np                 # pip: numpy==1.26.0
smuggle pandas as pd                # pip: pandas==2.1.1
from scipy smuggle stats            # pip: scipy==1.11.2

Collecting numpy==1.26.0
  Obtaining dependency information for numpy==1.26.0 from https://files.pythonhosted.org/packages/35/21/9e150d654da358beb29fe216f339dc17f2b2ac13fff2a89669401a910550/numpy-1.26.0-cp311-cp311-macosx_11_0_arm64.whl.metadata

  Downloading numpy-1.26.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (99 kB)
?25l     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0.0/99.7 kB ? eta -:--:--
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╸━━━ 92.2/99.7 kB 6.2 MB/s eta 0:00:01

     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 99.7/99.7 kB 2.3 MB/s eta 0:00:00
?25hDownloading numpy-1.26.0-cp311-cp311-macosx_11_0_arm64.whl (14.0 MB)
?25l   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0.0/14.0 MB ? eta -:--:--

   ━━━━━╺━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.9/14.0 MB 49.6 MB/s eta 0:00:01

   ━━━━━━━━━━━━━━━━━━╸━━━━━━━━━━━━━━━━━━━━━ 6.5/14.0 MB 84.3 MB/s eta 0:00:01

   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╸━━━━━━━━ 11.1/14.0 MB 112.5 MB/s eta 0:00:01

   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╸ 14.0/14.0 MB 118.2 MB/s eta 0:00:01
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 14.0/14.0 MB 85.3 MB/s eta 0:00:00
?25h

Installing collected packages: numpy

Successfully installed numpy-1.26.0

[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: pip install --upgrade pip

---------------------------------------------------------------------------
StopIteration                             Traceback (most recent call last)
File /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/importlib/metadata/__init__.py:563, in Distribution.from_name(cls, name)
    562 try:
--> 563     return next(cls.discover(name=name))
    564 except StopIteration:

StopIteration: 

During handling of the above exception, another exception occurred:

PackageNotFoundError                      Traceback (most recent call last)
Cell In[2], line 3
      1 smuggle(name="collections.abc.Iterable", as_="Iterable")
----> 3 smuggle(name="numpy", as_="np", installer="pip", args_str="""numpy==1.26.0""", installer_kwargs={'editable': False, 'spec': 'numpy==1.26.0'})
      4 smuggle(name="pandas", as_="pd", installer="pip", args_str="""pandas==2.1.1""", installer_kwargs={'editable': False, 'spec': 'pandas==2.1.1'})
      5 smuggle(name="scipy.stats", as_="stats", installer="pip", args_str="""scipy==1.11.2""", installer_kwargs={'editable': False, 'spec': 'scipy==1.11.2'})

File /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/davos/core/core.py:979, in use_project.<locals>.smuggle_wrapper(*args, **kwargs)
    977 importlib.invalidate_caches()
    978 try:
--> 979     return smuggle_func(*args, **kwargs)
    980 finally:
    981     # after (possibly installing and) loading the package,
    982     # remove the project's site-packages directory from
   (...)
    986     # other dirs being prepended to load modules installed
    987     # in custom locations
    988     sys.path.remove(str(project.site_packages_dir))

File /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/davos/core/core.py:1091, in smuggle(name, as_, installer, args_str, installer_kwargs)
   1087     importlib.reload(sys.modules['pkg_resources'])
   1088 # check whether the smuggled package and/or any
   1089 # installed/updated dependencies were already imported during
   1090 # the current runtime
-> 1091 prev_imported_pkgs = get_previously_imported_pkgs(installer_stdout,
   1092                                                   onion.installer)
   1093 # if the smuggled package was previously imported, deal with
   1094 # it last so it's reloaded after its dependencies are in place
   1095 try:

File /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/davos/core/core.py:257, in get_previously_imported_pkgs(install_cmd_stdout, installer)
    255 pkg_name = dist_name.rsplit('-', maxsplit=1)[0]
    256 # use the install name to get the package distribution object
--> 257 dist = metadata.distribution(pkg_name)
    258 # check the distribution's metadata for a file containing
    259 # top-level import names. Also includes names of namespace
    260 # packages (e.g. mpl_toolkits from matplotlib), if any.
    261 toplevel_names = dist.read_text('top_level.txt')

File /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/importlib/metadata/__init__.py:981, in distribution(distribution_name)
    975 def distribution(distribution_name):
    976     """Get the ``Distribution`` instance for the named package.
    977 
    978     :param distribution_name: The name of the distribution package as a string.
    979     :return: A ``Distribution`` instance (or subclass thereof).
    980     """
--> 981     return Distribution.from_name(distribution_name)

File /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/importlib/metadata/__init__.py:565, in Distribution.from_name(cls, name)
    563     return next(cls.discover(name=name))
    564 except StopIteration:
--> 565     raise PackageNotFoundError(name)

PackageNotFoundError: No package metadata was found for numpy

Implement a GLM from scratch#

def requires_fit(method):
    """
    Decorator used in GLM class to disallow running certain methods 
    before fitting the model.
    """
    def check_fitted_wrapper(self, *args, **kwargs):
        if not self.fitted:
            raise ValueError(
                f"Model must be fit before running self.{method.__name__}. "
                "Fit the model first with self.fit()"
            )
        return method(self, *args, **kwargs)
    return check_fitted_wrapper

class GLM:
    def __init__(self, X, y, ensure_intercept=True, predictor_labels=None):
        """
        Initialize a GLM instance to fit, inspect, and run tests.
        Accepts a design matrix X and response vector y as lists/numpy
        arrays or labeled pandas DataFrames/Series.

        Parameters
        ----------
        X : array-like
            Design matrix with shape (n, p). `n` is the number of
            observations and `p` is the number of predictors, optionally
            including an intercept (see `ensure_intercept` below).
            If X is a pandas DataFrame, column names will be used as
            labels for the predictor variables.
        y : array-like
            Response vector with shape (n,).
        ensure_intercept : bool, optional
            If True (default), ensure that the first column of X
            contains all 1's for the intercept, inserting one if
            necessary. If False, do not include an intercept.
        predictor_labels : list of str, optional
            List of labels for the predictors in X. If None (default),
            use column names from X if X is a pandas DataFrame,
            otherwise use generic labels "X_0", "X_1", etc.
        """
        if predictor_labels is None:
            if isinstance(X, pd.DataFrame):
                self.predictor_labels = X.columns.tolist()
            elif isinstance(X, pd.Series):
                self.predictor_labels = X.index.tolist()
            else:
                self.predictor_labels = [f"X_{i}" for i in range(X.shape[1])]
        elif not isinstance(predictor_labels, Iterable):
            raise TypeError("'predictor_labels' must be an iterable if passed")
        elif len(predictor_labels) != X.shape[1]:
            raise ValueError(
                f"'predictor_labels' must contain {X.shape[1]} labels"
            )
        else:
            self.predictor_labels = predictor_labels

        X = np.asarray(X)
        y = np.asarray(y).squeeze()

        if X.shape[0] != y.shape[0]:
            raise ValueError(
                "X and y must have the same number of rows (observations)"
            )
        if y.ndim != 1:
            raise ValueError("y must be a n x 1 vector of observed responses")

        if ensure_intercept and not (X[:, 0] == 1).all():
            self.X_ = np.hstack((np.ones((X.shape[0], 1)), X))
            self.predictor_labels.insert(0, "Intercept")
        else:
            self.X_ = X

        self.y_ = y
        self.n_obs = X.shape[0]
        self.n_predictors = self.X_.shape[1]    # p includes intercept
        self.df_model = self.n_predictors - 1
        self.df_error = self.n_obs - self.n_predictors
        self.fitted = False

    def fit(self):
        """
        Fit the model with the data provided at initialization. Both
        returns the fitted model and updates it in place (a la
        scikit-learn estimators).

        Returns
        -------
        GLM
            The fitted GLM instance
        """
        gram_inv = np.linalg.inv(self.X_.T @ self.X_)
        # beta-hat, shape: (self.n_predictors,)
        self.betas = gram_inv @ self.X_.T @ self.y_
        # y-hat, shape: (self.n_obs,)
        # also alias y_hat as y_pred (standard scikit-learn convention)
        self.y_hat = self.y_pred = self.X_ @ self.betas
        # residuals, shape: (self.n_obs,)
        self.residuals = self.y_ - self.y_hat
        self.ss_res = (self.residuals ** 2).sum()
        self.ss_tot = ((self.y_ - self.y_.mean()) ** 2).sum()
        self.R2 = 1 - self.ss_res / self.ss_tot
        self.AIC = self.n_obs * np.log(self.ss_res / self.n_obs) + 2 * self.n_predictors
        # variance/covariance matrix for betas,
        # shape: (self.n_predictors, self.n_predictors)
        self.beta_cov = self.ss_res / self.df_error * gram_inv
        # standard errors for betas, shape: (self.n_predictors,)
        self.beta_ses = np.diag(self.beta_cov) ** 0.5
        # t-statistics for betas, shape: (self.n_predictors,)
        self.beta_ts = self.betas / self.beta_ses
        # (2-tailed) p-values for t-tests for betas,
        # shape: (self.n_predictors,)
        self.beta_ps = 2 * stats.t.sf(np.abs(self.beta_ts), self.df_error)
        # set fitted flag to True to enable other methods that require
        # these computations first
        self.fitted = True
        return self

    @requires_fit
    def predict(self, X):
        """
        Predict the value of the response variable for new data.

        Parameters
        ----------
        X : array-like
            Array containing with the same number of predictors as the
            design matrix used to fit the model. May contain any number
            of observations.

        Returns
        -------
        y_pred : 1D array
            Predicted values of the response variable for each row of X.

        """
        X = np.atleast_2d(np.asarray(X))
        if X.shape[1] != self.n_predictors:
            raise ValueError(
                f"X must have {self.n_predictors} columns (predictors)"
            )
        return X @ self.betas

    @requires_fit
    def summary(self):
        """Print a formatted summary table of model results."""
        print("General Linear Model")
        print("===================")
        print(f"Number of observations: {self.n_obs}")
        print(f"Number of predictors: {self.n_predictors}")
        print(f"Model degrees of freedom: {self.df_model}")
        print(f"Error degrees of freedom: {self.df_error}")
        print(f"Residual standard error: {self.ss_res:.3f}")
        print(f"R^2: {self.R2:.3f}")
        print(f"AIC: {self.AIC:.3f}")
        print("\nCoefficients:")
        print("-------------")
        # Format the table width to fit the longest predictor name
        # (min width: 4 cols)
        name_col_width = max(len(max(self.predictor_labels, key=len)), 4)
        header = "\033[1m{:<{width}}{:>8}{:>8}{:>8}{:>8}\033[0m".format(
            "Name", "B̂", "SE", "t", "p", width=name_col_width
        )
        print(header)
        for i in range(self.n_predictors):
            label = self.predictor_labels[i]
            beta = self.betas[i]
            se = self.beta_ses[i]
            t = self.beta_ts[i]
            p = self.beta_ps[i]
            if p < 0.001:
                sig = '***'
            elif p < 0.01:
                sig = '**'
            elif p < 0.05:
                sig = '*'
            else:
                sig = ''
            print("{:<{width}}{:>8.3f}{:>8.3f}{:>8.3f}{:>8.3f} \033[1m{sig}\033[0m".format(
                label, beta, se, t, p, width=name_col_width, sig=sig)
            )

    @requires_fit
    def t_test(self, predictor):
        """
        Two-tailed t-test for the null hypothesis that the estimated
        coefficient for a given predictor is not significantly
        different from zero.

        Parameters
        ----------
        predictor : int or str
            Index or label of predictor for which to run the t-test.
            Note that the 0th predictor refers to the intercept.

        Returns
        -------
        t, p
            t-statistic and associated p-value as a tuple.
        """
        if isinstance(predictor, str):
            predictor = self.predictor_labels.index(predictor)
        elif not isinstance(predictor, int):
            raise TypeError("'predictor' must be an int or str")
        return self.beta_ts[predictor], self.beta_ps[predictor]

    @requires_fit
    def test_contrast(self, contrast):
        """
        Two-tailed t-test for the null hypothesis that the given
        contrast of predictors is not significantly different from zero.

        Parameters
        ----------
        contrast : array-like
            Contrast vector of shape (p,). p is the number of predictors
            (including the intercept).

        Returns
        -------
        t, p
            t-statistic and associated p-value as a tuple.
        """
        contrast = np.asarray(contrast).squeeze()
        if contrast.ndim != 1:
            raise ValueError("'contrast' must be a 1D array")
        if contrast.shape[0] != self.n_predictors:
            raise ValueError(
                f"'contrast' must have {self.n_predictors} elements"
            )
        # compute t-statistic
        # note: contrast matrices not transposed here because numpy
        # automatically does so for 1D vectors
        t_stat = (
                (contrast @ self.betas) /
                np.sqrt(contrast @ self.beta_cov @ contrast)
        )
        # compute (2-tailed) p-value
        p_value = 2 * stats.t.sf(np.abs(t_stat), self.df_error)
        return t_stat, p_value

    @requires_fit
    def f_test(self, predictors=None):
        """
        F-test for the null hypothesis that the given (set of)
        predictor(s) has no effect on the response variable (i.e., that
        the quality of the model fit would not significantly worsen if
        the given predictor(s) were removed).

        Parameters
        ----------
        predictors : int, str, or iterable of int or str, optional
            index/indices or label(s) of predictor(s) for which to run
            the F-test. Note that the 0th predictor refers to the
            intercept. If None (default), the F-test is run on all
            predictors (i.e., an intercept-only model).

        Returns
        -------
        F, p: tuple of float
            F-statistic and associated p-value as a tuple.
        """
        if predictors is None:
            predictors = predictors = list(range(1, self.n_predictors))
        elif isinstance(predictors, int):
            predictors = [predictors]
        elif isinstance(predictors, str):
            predictors = [self.predictor_labels.index(predictors)]
        elif isinstance(predictors, Iterable):
            _predictors = []
            for p in predictors:
                if isinstance(p, str):
                    _predictors.append(self.predictor_labels.index(p))
                elif isinstance(p, int):
                    _predictors.append(p)
                else:
                    raise TypeError(
                        "'predictors' must be an int, str, or iterable of int "
                        "or str"
                    )
            predictors = _predictors
        if not all(p in range(self.n_predictors) for p in predictors):
            raise ValueError("'predictors' must be a valid predictor index or "
                             "label"
                             )
        if len(predictors) == 0:
            raise ValueError("'predictors' must contain at least one index or "
                             "label"
                             )
        # get sum of squared residuals for reduced model
        X_reduced = np.delete(self.X_, predictors, axis=1)
        betas_reduced = np.linalg.inv(X_reduced.T @ X_reduced) @ X_reduced.T @ self.y_
        residuals_reduced = self.y_ - X_reduced @ betas_reduced
        ssr_reduced = (residuals_reduced ** 2).sum()
        # compute F-statistic
        ssr_full = self.ss_res
        p = self.n_predictors
        q = len(predictors)
        n = self.n_obs
        f_stat = ((ssr_reduced - ssr_full) / (p - q)) / (ssr_full / (n - p))
        # compute p-value
        p_value = stats.f.sf(f_stat, p - q, n - p)
        return f_stat, p_value

Load in (and clean) some data to test it out!#

# load dataset from GitHub
dataset_url = (
    "https://github.com/torwager/ComputationalFoundations/raw/"
    "79b92a3/CompFound/datasets/Ashar2022_outcomes_demographics.csv"
)
df_raw = pd.read_csv(dataset_url, na_values={'current_opioid_use': '<undefined>'})

# drop individuals that don't have data for both sessions
dupes_only = df_raw.loc[df_raw.duplicated('id', keep=False)]

# make sure each individual's data actually comes from 2 different 
# sessions (i.e., no duplicates or mislabels)
for i in dupes_only['id'].unique():
    assert set(dupes_only.query('id == @i')['time']) == {1, 2}

# reshape to have one row per participant with data from both sessions
# pivot on columns with session-agnostic values (age, gender, etc.)
same_val_cols = (
    ['id', 'group'] + 
    list(dupes_only.columns[list(dupes_only.columns).index('education'):])
)
df = dupes_only.pivot(index=same_val_cols, columns='time')
df.columns = df.columns.map(
    lambda x: f'{x[0]}_{"before" if x[1] == 1 else "after"}'
)
df = df.reset_index()

# deal with columns for data that wasn't collected in both sessions
all_nan_cols = df.columns[df.isnull().sum(axis=0) == len(df)]
df = df.drop(columns=all_nan_cols)
for colname in all_nan_cols:
    base_colname, _, suffix = colname.rpartition('_')
    other_suffix = 'before' if suffix == 'after' else 'after'
    df = df.rename(columns={f'{base_colname}_{other_suffix}': base_colname})
    
# add column for change in pain level
# (this will be the response variable for the model)
df['bpi_intensity_change'] = df['bpi_intensity_before'] - df['bpi_intensity_after']
# display dataset
df

	id	group	education	ethnicity	hispanic	employment_status	exercise	handedness	sses	married_or_living_as_marri	...	sopa_emo_after	pgic	tx_satisfaction	alcohol_before	alcohol_after	opioid_before	opioid_after	cannabis_before	cannabis_after	bpi_intensity_change
0	12	3.0	2.0	4.0	0.0	1.0	3.0	1.0	4.0	0.0	...	8.0	1.0	52.0	0.0	0.0	0.0	0.0	2.000	0.000	-0.25
1	14	3.0	3.0	4.0	0.0	3.0	4.0	1.0	9.0	0.0	...	6.0	1.0	30.5	23.0	21.0	0.0	0.0	3.500	3.500	0.50
2	15	3.0	2.0	5.0	0.0	1.0	3.0	1.0	6.0	1.0	...	4.0	1.0	50.5	15.0	29.0	0.0	0.0	0.000	0.000	-0.50
3	18	2.0	3.0	4.0	0.0	1.0	3.0	1.0	5.0	1.0	...	9.0	5.0	50.0	0.0	2.0	0.0	0.0	0.000	0.000	2.25
4	23	2.0	3.0	2.0	0.0	1.0	5.0	1.0	6.0	0.0	...	8.0	1.0	17.5	29.0	13.5	0.0	0.0	0.015	0.025	0.25
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
130	1268	2.0	3.0	4.0	0.0	2.0	4.0	1.0	8.0	1.0	...	8.0	5.0	88.0	2.0	1.0	0.0	0.0	0.000	0.000	1.75
131	1277	2.0	3.0	4.0	0.0	1.0	3.0	1.0	8.0	1.0	...	10.0	5.0	46.0	23.0	30.0	0.0	0.0	0.000	0.000	-0.75
132	1294	2.0	3.0	4.0	0.0	1.0	3.0	1.0	6.0	0.0	...	8.0	2.0	54.0	0.0	0.0	0.0	0.0	0.000	0.000	0.00
133	1302	3.0	3.0	4.0	0.0	1.0	2.0	1.0	8.0	0.0	...	6.0	2.0	64.0	3.0	3.0	0.0	0.0	0.000	0.000	0.00
134	1319	2.0	3.0	5.0	0.0	1.0	3.0	1.0	5.0	0.0	...	5.0	5.0	69.0	11.0	3.0	2.0	1.0	7.000	19.000	0.75

135 rows × 51 columns

select a subset of variables to use as predictors#

predictors = [
    'group', 'education', 'ethnicity', 'hispanic', 'employment_status', 
    'exercise', 'handedness', 'sses', 'married_or_living_as_marri', 'age', 
    'weight', 'gender', 'backpain_length'
]
X = df[predictors]
y = df['bpi_intensity_change']

fit the model and display results#

glm = GLM(X, y).fit()
glm.summary()

General Linear Model
===================
Number of observations: 135
Number of predictors: 14
Model degrees of freedom: 13
Error degrees of freedom: 121
Residual standard error: 193.674
R^2: 0.399
AIC: 76.722

Coefficients:
-------------
Name                            B̂      SE       t       p
Intercept                    5.284   1.641   3.220   0.002 **
group                       -1.054   0.137  -7.670   0.000 ***
education                   -0.543   0.278  -1.955   0.053 
ethnicity                    0.058   0.219   0.264   0.792 
hispanic                    -0.373   0.716  -0.522   0.603 
employment_status           -0.298   0.147  -2.019   0.046 *
exercise                    -0.121   0.124  -0.975   0.331 
handedness                   0.462   0.240   1.924   0.057 
sses                        -0.047   0.068  -0.681   0.497 
married_or_living_as_marri   0.119   0.254   0.466   0.642 
age                          0.009   0.009   1.033   0.304 
weight                      -0.003   0.004  -0.881   0.380 
gender                       0.253   0.258   0.981   0.329 
backpain_length             -0.018   0.014  -1.270   0.206 

run some individual t-tests#

print('education')
t, p = glm.t_test('education')
print(f"\t{t = }, {p = }")

print('employment_status')
t, p = glm.t_test('employment_status')
print(f"\t{t = }, {p = }")

education
	t = -1.9546532038088111, p = 0.05293030529967579
employment_status
	t = -2.0186211253690964, p = 0.04573738199608248

run some F-tests on a few subsets of predictors#

print('intercept-only model')
F, p = glm.f_test()
print(f"\t{F = }, {p = }\n")

print('["group", "employment_status", "age"]')
F, p = glm.f_test(["employment_status", "age"])
print(f"\t{F = }, {p = }")

intercept-only model
	F = 80.22027070929019, p = 4.903937633539385e-15

["group", "employment_status", "age"]
	F = 0.36459164573984326, p = 0.9733056577039912

test some contrasts#

group_vs_education = np.zeros(glm.n_predictors, dtype=int)
group_vs_education[[1, 2]] = [1, -1]
print(f"{group_vs_education = }")
t, p = glm.test_contrast(group_vs_education)
print(f"\t{t = }, {p = }\n")

ethnicity_vs_employment_status = np.zeros(glm.n_predictors, dtype=int)
ethnicity_vs_employment_status[[3, 5]] = [1, -1]
print(f"{ethnicity_vs_employment_status = }")
t, p = glm.test_contrast(ethnicity_vs_employment_status)
print(f"\t{t = }, {p = }")

group_vs_education = array([ 0,  1, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])
	t = -1.7110376121312285, p = 0.0896365856306869

ethnicity_vs_employment_status = array([ 0,  0,  0,  1,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0])
	t = 1.382285772613835, p = 0.16943021507691455

predict response variable for some (fake) new data#

np.random.seed(0)
new_data = np.random.rand(3, glm.X_.shape[1])
glm.predict(new_data)

array([1.99317962, 0.11215594, 2.29967943])

Load packages

Contents

Load packages#

Implement a GLM from scratch#

Load in (and clean) some data to test it out!#

select a subset of variables to use as predictors#

fit the model and display results#

run some individual t-tests#

run some F-tests on a few subsets of predictors#

test some contrasts#

predict response variable for some (fake) new data#