Assumptions of Simple Linear Regression

Lucy D’Agostino McGowan

Linearity

Linearity

What goes wrong if the relationship between \(x\) and \(y\) isn’t linear?

Code
library(tidyverse)

set.seed(1)

# simulate data with a truly linear relationship: y = 2x + noise
d1 <- tibble(x = rnorm(100),
             y = 2 * x + rnorm(100))
ggplot(d1, aes(x = x, y = y)) +
  geom_point(color = "#86a293",
             alpha = 0.5) +
  geom_smooth(method = "lm",
              formula = "y ~ x",
              se = FALSE,
              color = "#86a293")

Code
m1 <- lm(y ~ x, data = d1)

\(\hat\beta_1\) = 2 (95% CI: 1.79, 2.21)

Code
# simulate data with no relationship between x and y
d2 <- tibble(x = rnorm(100),
             y = rnorm(100))
ggplot(d2, aes(x = x, y = y)) +
  geom_point(color = "#86a293",
             alpha = 0.5) +
  geom_smooth(method = "lm",
              formula = "y ~ x",
              se = FALSE,
              color = "#86a293")

Code
m2 <- lm(y ~ x, data = d2)

\(\hat\beta_1\) = 0.11 (95% CI: -0.08, 0.3)

Code
# simulate a curved (quadratic) relationship: y = 3x^2 + noise
d3 <- tibble(x = rnorm(100),
             y = 3 * x^2 + rnorm(100))
ggplot(d3, aes(x = x, y = y)) +
  geom_point(color = "#86a293",
             alpha = 0.5) +
  geom_smooth(method = "lm",
              formula = "y ~ x",
              se = FALSE,
              color = "#86a293")

Code
m3 <- lm(y ~ x, data = d3)

\(\hat\beta_1\) = 0.92 (95% CI: -0.15, 1.98)
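One possible remedy for the curvature in d3 (a sketch, not part of the fits above; m3_quad is a hypothetical name) is to model the squared term directly, a preview of multiple regression:

Code
# add a squared term so the model can capture the curvature in d3
m3_quad <- lm(y ~ x + I(x^2), data = d3)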

Linearity

  • Why is it important? If the true relationship between \(x\) and \(y\) is not linear, \(\hat\beta_1\) summarizes a line that does not describe the data, as in the quadratic example above.
  • How do you check it?
    • Visualize your data
    • Scatterplot of \(x\) and \(y\) (a comparison with a flexible smoother is sketched below)
    • Residuals vs fits plot
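One quick visual check (a sketch; the loess smoother used here is ggplot2's flexible default and is not covered above): overlay a flexible smoother on the scatterplot and see whether it pulls away from the least squares line.

Code
# compare the least squares line to a flexible (loess) smooth;
# systematic divergence between the two suggests non-linearity
ggplot(d3, aes(x = x, y = y)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  geom_smooth(method = "loess", formula = y ~ x,
              se = FALSE, color = "red")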

Residuals vs fits plot

How can I add the fitted values \((\hat{y})\) to my data frame?

mod <- lm(y ~ x, data = d1)
d1 <- d1 %>%
  mutate(y_hat = fitted(mod))

Residuals vs fits plot

How can I add the residual values \((e)\) to my data frame?

mod <- lm(y ~ x, data = d1)
d1 <- d1 %>%
  mutate(y_hat = fitted(mod),
         e = y - y_hat)

Residuals vs fits plot

How can I add the residual values \((e)\) to my data frame?

mod <- lm(y ~ x, data = d1)
d1 <- d1 %>%
  mutate(y_hat = fitted(mod),
         e = residuals(mod))
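An equivalent shortcut (a sketch using broom, the same package these slides use later for confidence intervals): broom::augment() returns the data along with the fitted values and residuals in one step, under the names .fitted and .resid.

Code
library(broom)

# augment() adds .fitted, .resid, and other per-observation
# diagnostics to the original data
d1_aug <- augment(mod)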

Residuals vs fits plot

How can I create a scatterplot of the residuals vs the fitted values?

ggplot(d1, aes(x = ----, y = ----)) +
  geom_---() + 
  geom_hline(yintercept = 0) +
  labs(x = "Fitted value",
       y = "Residual")

Residuals vs fits plot

How can I create a scatterplot of the residuals vs the fitted values?

ggplot(d1, aes(x = y_hat, y = e)) +
  geom_point() + 
  geom_hline(yintercept = 0) +
  labs(x = "Fitted value",
       y = "Residual")

Residuals vs fits plot

Code
ggplot(d1, aes(x = y_hat, y = e)) +
  geom_point(color = "#86a293") + 
  geom_hline(yintercept = 0, color = "#86a293") +
  labs(x = "Fitted value",
       y = "Residual")

Code
d2 <- d2 %>%
  mutate(y_hat = fitted(m2),
         e = residuals(m2))
ggplot(d2, aes(x = y_hat, y = e)) +
  geom_point(color = "#86a293") + 
  geom_hline(yintercept = 0, color = "#86a293") +
  labs(x = "Fitted value",
       y = "Residual")

Code
d3 <- d3 %>%
  mutate(y_hat = fitted(m3),
         e = residuals(m3))
ggplot(d3, aes(x = y_hat, y = e)) +
  geom_point(color = "#86a293") + 
  geom_hline(yintercept = 0, color = "#86a293") +
  labs(x = "Fitted value",
       y = "Residual")

The residuals have a mean of zero

Zero mean

Code
ggplot(d1, aes(x = x, y = y)) +
  geom_point(color = "#86a293",
             alpha = 0.5) +
  geom_segment(aes(
    x = x,
    y = y,
    xend = x,
    yend = fitted(m1)
  ),
  color = "blue") +
  geom_smooth(
    method = "lm",
    formula = "y ~ x",
    se = FALSE,
    color = "#86a293"
  )

\(\sum e_i = 0\)

Code
ggplot(d2, aes(x = x, y = y)) +
  geom_point(color = "#86a293",
             alpha = 0.5) +
    geom_segment(aes(
    x = x,
    y = y,
    xend = x,
    yend = fitted(m2)
  ),
  color = "blue") +
  geom_smooth(method = "lm",
              formula = "y ~ x",
              se = FALSE,
              color = "#86a293")

\(\sum e_i = 0\)

Code
ggplot(d3, aes(x = x, y = y)) +
  geom_point(color = "#86a293",
             alpha = 0.5) +
  geom_segment(aes(
    x = x,
    y = y,
    xend = x,
    yend = fitted(m3)
  ),
  color = "blue") +
  geom_smooth(method = "lm",
              formula = "y ~ x",
              se = FALSE,
              color = "#86a293")

\(\sum e_i = 0\)

Zero mean

  • Why is it important? It is a built-in property of least squares: minimizing the sum of squared errors with respect to \(\hat\beta_0\) forces \(\sum e_i = 0\).
  • How do you check it?
    • It will always be true for simple linear regression fit with least squares (lm in R)
    • If you would like to check anyway, here is some R code:
mod <- lm(y ~ x, data = d1)
d1 %>%
  mutate(e = residuals(mod)) %>%
  summarize(e = sum(e))
# A tibble: 1 × 1
          e
      <dbl>
1 -1.32e-16
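The sum is \(-1.32\times 10^{-16}\) rather than exactly zero only because of floating-point rounding. A tolerant check (a sketch using dplyr's near(), which compares within a small numerical tolerance):

Code
# near() avoids testing doubles with exact equality
near(sum(residuals(mod)), 0)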

Constant variance

Constant variance

  • The variance of the outcome \((y)\) does not change as the predictor \((x)\) changes
  • In other words, the spread of the points around the regression line is roughly the same across the range of \(x\)
  • Can be assessed with the “Residuals vs fits” plot

Residuals vs fits plot

How can I create a scatterplot of the residuals vs the fitted values?

ggplot(d1, aes(x = y_hat, y = e)) +
  geom_point() + 
  geom_hline(yintercept = 0) +
  labs(x = "Fitted value",
       y = "Residual")

Residuals vs fits plot

Code
ggplot(d1, aes(x = y_hat, y = e)) +
  geom_point(color = "#86a293") + 
  geom_hline(yintercept = 0, color = "#86a293") +
  labs(x = "Fitted value",
       y = "Residual")

Code
ggplot(d2, aes(x = y_hat, y = e)) +
  geom_point(color = "#86a293") + 
  geom_hline(yintercept = 0, color = "#86a293") +
  labs(x = "Fitted value",
       y = "Residual")

Code
ggplot(d3, aes(x = y_hat, y = e)) +
  geom_point(color = "#86a293") + 
  geom_hline(yintercept = 0, color = "#86a293") +
  labs(x = "Fitted value",
       y = "Residual")

Residuals vs fits plot

Code
ggplot(d1, aes(x = y_hat, y = e)) +
  geom_point(color = "#86a293") + 
  geom_hline(yintercept = 0, color = "#86a293") +
  labs(x = "Fitted value",
       y = "Residual")

Code
# simulate errors whose spread grows with x (non-constant variance)
d4 <- tibble(x = rnorm(100),
             y = 2 * x + x / 2 * rnorm(100, sd = 10))
m4 <- lm(y ~ x, data = d4)
d4 <- d4 %>%
  mutate(y_hat = fitted(m4),
         e = residuals(m4))
ggplot(d4, aes(x = y_hat, y = e)) +
  geom_point(color = "#86a293") + 
  geom_hline(yintercept = 0, color = "#86a293") +
  labs(x = "Fitted value",
       y = "Residual")

Code
# another non-constant variance example: error spread grows with x
d5 <- tibble(x = runif(100, max = 10),
             y = x * rnorm(100, sd = 10))
m5 <- lm(y ~ x, data = d5)
d5 <- d5 %>%
  mutate(y_hat = fitted(m5),
         e = residuals(m5))
ggplot(d5, aes(x = y_hat, y = e)) +
  geom_point(color = "#86a293") + 
  geom_hline(yintercept = 0, color = "#86a293") +
  labs(x = "Fitted value",
       y = "Residual")

Constant variance

  • Why is it important?
    • The least squares estimator of the relationship between \(x\) and \(y\) has the narrowest standard errors among all linear unbiased estimators (the Gauss–Markov theorem). If this assumption is violated, this is no longer true.
    • When calculating a prediction interval for a single observation, we need this assumption to make sure we are using the right variance
    • If you take STA 312, you will prove this and you will need this assumption to do so
  • How do you check it?
    • Residuals vs fits plot (a scale-location plot, sketched below, is a common variant)
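A scale-location plot (a sketch; rstandard() and this plot do not appear in the slides above) shows the square root of the absolute standardized residuals against the fitted values; a trend in this plot suggests non-constant variance.

Code
# scale-location plot for the heteroscedastic example m4
d4 %>%
  mutate(std_e = rstandard(m4)) %>%
  ggplot(aes(x = y_hat, y = sqrt(abs(std_e)))) +
  geom_point(color = "#86a293") +
  geom_smooth(method = "loess", formula = y ~ x, se = FALSE) +
  labs(x = "Fitted value",
       y = "sqrt(|Standardized residual|)")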

Normally distributed errors

Normally distributed errors

  • We assume the errors \((\varepsilon)\) follow a normal distribution
  • This allows us to use shortcuts when calculating confidence intervals and doing hypothesis tests
  • We can check this assumption by examining the distribution of the residuals

What kind of plot could help us assess whether a variable’s distribution is Normal?

Histogram

mod <- lm(y ~ x, data = d1)
d1 <- d1 %>%
  mutate(e = ---(mod))

ggplot(d1, aes(x = ---)) + 
  geom_----

Histogram

mod <- lm(y ~ x, data = d1)
d1 <- d1 %>%
  mutate(e = residuals(mod))

ggplot(d1, aes(x = e)) + 
  geom_histogram(bins = 8)

Histogram

Code
ggplot(d1, aes(x = e)) +
  geom_histogram(fill = "#86a293", bins = 20)

Code
ggplot(d2, aes(x = e)) +
  geom_histogram(fill = "#86a293", bins = 20)

Code
ggplot(d3, aes(x = e)) +
  geom_histogram(fill = "#86a293", bins = 20)

Normal Quantile (Q-Q) plot

  • Scatterplot of the ordered observed residuals versus the values (the theoretical quantiles) that we would expect from a “perfect” normal sample of the same sample size
ggplot(d1, aes(sample = e)) +
  geom_qq()  + 
  geom_qq_line()
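To see what geom_qq() is computing (a sketch; ppoints() supplies evenly spaced probability points, which matches geom_qq's default behavior): sort the residuals and pair them with standard normal quantiles.

Code
# a Q-Q plot by hand: sorted residuals vs standard normal quantiles
n <- nrow(d1)
tibble(theoretical = qnorm(ppoints(n)),
       observed = sort(d1$e)) %>%
  ggplot(aes(x = theoretical, y = observed)) +
  geom_point() +
  labs(x = "Theoretical quantile", y = "Observed residual")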


Q-Q plots

Code
ggplot(d1, aes(sample = e)) +
  geom_qq(color = "#86a293") + 
  geom_qq_line(color = "#86a293")

Code
ggplot(d2, aes(sample = e)) +
  geom_qq(color = "#86a293") + 
  geom_qq_line(color = "#86a293")

Code
ggplot(d3, aes(sample = e)) +
  geom_qq(color = "#86a293") + 
  geom_qq_line(color = "#86a293")

Normally distributed errors

  • Why is it important?
    • In very small samples, it is important for our 95% confidence intervals to actually have 95% coverage
    • It is also important for prediction intervals (when making an interval for a single observation)
    • In medium / large samples it is often ok regardless, thanks to the central limit theorem
  • How do you check it?
    • Visualize the residuals (histogram, Q-Q plot); a formal test is sketched below
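For completeness, base R also offers a formal test (a sketch; these slides rely on visual checks, and the Shapiro–Wilk test can flag trivial deviations when the sample is large):

Code
# Shapiro-Wilk test: a small p-value suggests the residuals are
# inconsistent with a normal distribution
shapiro.test(residuals(m1))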

Independence & Randomness

Magnolia data

Code
full_magnolia_data %>%
  nest_by(id) %>%
  mutate(mod = list(lm(leaf_length ~ leaf_width, data = data))) %>%
  summarise(broom::tidy(mod, conf.int = TRUE)) %>%
  filter(term == "leaf_width") %>%
  ggplot() +
  geom_segment(aes(
    x = conf.low,
    xend = conf.high,
    y = id,
    yend = id,
    color = id == 3
  )) +
  geom_vline(xintercept = coef(lm(leaf_length ~ leaf_width, full_magnolia_data))[2], lty = 2) +
  scale_x_continuous(limits = c(-2, 4)) + 
  scale_color_manual(values = c("black", "cornflower blue")) +
  labs(x = "Relationship between leaf length and leaf width (Slope)",
       y = "Student") +
  theme(legend.position = "none")

Magnolia data

Code
full_magnolia_data %>%
  nest_by(id) %>%
  mutate(mod = list(lm(leaf_length ~ leaf_width, data = data))) %>%
  summarise(broom::tidy(mod, conf.int = TRUE)) %>%
  filter(term == "leaf_width") %>%
  ggplot() +
  geom_segment(aes(
    x = conf.low,
    xend = conf.high,
    y = id,
    yend = id,
    color = id == 3
  )) +
  geom_vline(xintercept = coef(lm(leaf_length ~ leaf_width, full_magnolia_data))[2], lty = 2) +
  scale_x_continuous(limits = c(-2, 4)) + 
  scale_color_manual(values = c("black", "cornflower blue")) +
  labs(x = "Relationship between leaf length and leaf width (Slope)",
       y = "Student") +
  theme(legend.position = "none")
  • If these were truly sampled randomly and observations were independent, we’d expect that 95% of these confidence intervals would contain the “true” \(\beta_1\)

Magnolia data

Code
full_magnolia_data %>%
  nest_by(id) %>%
  mutate(mod = list(lm(leaf_length ~ leaf_width, data = data))) %>%
  summarise(broom::tidy(mod, conf.int = TRUE)) %>%
  filter(term == "leaf_width") %>%
  ggplot() +
  geom_segment(aes(
    x = conf.low,
    xend = conf.high,
    y = id,
    yend = id,
    color = id == 3
  )) +
  geom_vline(xintercept = coef(lm(leaf_length ~ leaf_width, full_magnolia_data))[2], lty = 2) +
  scale_x_continuous(limits = c(-2, 4)) + 
  scale_color_manual(values = c("black", "cornflower blue")) +
  labs(x = "Relationship between leaf length and leaf width (Slope)",
       y = "Student") +
  theme(legend.position = "none")

Are these independent?

Magnolia data

Code
full_magnolia_data %>%
  nest_by(id) %>%
  mutate(mod = list(lm(leaf_length ~ leaf_width, data = data))) %>%
  summarise(broom::tidy(mod, conf.int = TRUE)) %>%
  filter(term == "leaf_width") %>%
  ggplot() +
  geom_segment(aes(
    x = conf.low,
    xend = conf.high,
    y = id,
    yend = id,
    color = id == 3
  )) +
  scale_x_continuous(limits = c(-2, 4)) + 
  geom_vline(xintercept = coef(lm(leaf_length ~ leaf_width, full_magnolia_data))[2], 
             lty = 2) +
  scale_color_manual(values = c("black", "cornflower blue")) +
  labs(x = "Relationship between leaf length and leaf width (Slope)",
       y = "Student") +
  theme(legend.position = "none")

Was the sample random?

Coverage probability

Code
avg_slope <- coef(lm(leaf_length ~ leaf_width, data = full_magnolia_data))[2]
full_magnolia_data %>%
  nest_by(id) %>%
  mutate(mod = list(lm(leaf_length ~ leaf_width, data = data))) %>%
  summarise(broom::tidy(mod, conf.int = TRUE)) %>%
  filter(term == "leaf_width") %>%
  ungroup() %>%
  summarise(coverage = mean(
    conf.low < avg_slope &
      conf.high > avg_slope
  ))
# A tibble: 1 × 1
  coverage
     <dbl>
1    0.611

Magnolia data

Let’s actually randomly sample from your magnolia data

set.seed(928)
full_magnolia_data <- full_magnolia_data %>%
  mutate(random_id = sample(1:44, size = n(), replace = TRUE))
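Giving each leaf a random_id scrambles the observations across 44 new “students”, so each group now behaves like a random sample from the full data.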

Magnolia data

Code
full_magnolia_data %>%
  nest_by(random_id) %>%
  mutate(mod = list(lm(leaf_length ~ leaf_width, data = data))) %>%
  summarise(broom::tidy(mod, conf.int = TRUE)) %>%
  filter(term == "leaf_width") %>%
  ggplot() +
  geom_segment(aes(
    x = conf.low,
    xend = conf.high,
    y = random_id,
    yend = random_id,
    color = random_id == 3
  )) +
  geom_vline(xintercept = coef(lm(leaf_length ~ leaf_width, full_magnolia_data))[2], lty = 2) +
  scale_x_continuous(limits = c(-2, 4)) + 
  scale_color_manual(values = c("black", "cornflower blue")) +
  labs(x = "Relationship between leaf length and leaf width (Slope)",
       y = "Student") +
  theme(legend.position = "none")

Coverage probability

Code
full_magnolia_data %>%
  nest_by(random_id) %>%
  mutate(mod = list(lm(leaf_length ~ leaf_width, data = data))) %>%
  summarise(broom::tidy(mod, conf.int = TRUE)) %>%
  filter(term == "leaf_width") %>%
  ungroup() %>%
  summarise(coverage = mean(
    conf.low < avg_slope &
      conf.high > avg_slope
  ))
# A tibble: 1 × 1
  coverage
     <dbl>
1    0.909
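This 90.9% coverage is much closer to the nominal 95% than the 61.1% we saw with the original, non-random groupings.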

Independence

  • Why is it important? When observations are not independent, the usual standard errors are too small, so 95% confidence intervals cover the truth far less often than advertised, as the magnolia data showed (61% coverage instead of the nominal 95%).
  • How can we check it?
    • If you only have one data sample, there is no formal check; you need to know about the data generation / sampling process. A simulation of coverage under true independence is sketched below.
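A small simulation sketch (the sample size, seed, and 1,000 replicates are arbitrary choices, not from the slides): when observations really are independent draws, roughly 95% of the 95% confidence intervals for the slope cover the true \(\beta_1 = 2\).

Code
# simulate many independent samples from y = 2x + noise and record
# how often the 95% CI for the slope covers the true value (2)
set.seed(42)
covered <- replicate(1000, {
  d <- tibble(x = rnorm(50), y = 2 * x + rnorm(50))
  ci <- confint(lm(y ~ x, data = d))["x", ]
  ci[1] < 2 & ci[2] > 2
})
mean(covered)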