Assumptions of Simple Linear Regression

Lucy D’Agostino McGowan

Linearity

Linearity

What goes wrong if the relationship between \(x\) and \(y\) isn’t linear?

Code
library(tidyverse)

set.seed(1)

# simulate data with a truly linear relationship: y = 2x + noise
d1 <- tibble(x = rnorm(100),
             y = 2 * x + rnorm(100))
ggplot(d1, aes(x = x, y = y)) +
  geom_point(color = "#86a293",
             alpha = 0.5) +
  geom_smooth(method = "lm",
              formula = "y ~ x",
              se = FALSE,
              color = "#86a293")

Code
m1 <- lm(y ~ x, data = d1)

\(\hat\beta_1\) = 2 (95% CI: 1.79, 2.21)

Code
# simulate data with no relationship between x and y
d2 <- tibble(x = rnorm(100),
             y = rnorm(100))
ggplot(d2, aes(x = x, y = y)) +
  geom_point(color = "#86a293",
             alpha = 0.5) +
  geom_smooth(method = "lm",
              formula = "y ~ x",
              se = FALSE,
              color = "#86a293")

Code
m2 <- lm(y ~ x, data = d2)

\(\hat\beta_1\) = 0.11 (95% CI: -0.08, 0.3)

Code
# simulate a curved (quadratic) relationship: y = 3x^2 + noise
d3 <- tibble(x = rnorm(100),
             y = 3 * x^2 + rnorm(100))
ggplot(d3, aes(x = x, y = y)) +
  geom_point(color = "#86a293",
             alpha = 0.5) +
  geom_smooth(method = "lm",
              formula = "y ~ x",
              se = FALSE,
              color = "#86a293")

Code
m3 <- lm(y ~ x, data = d3)

\(\hat\beta_1\) = 0.92 (95% CI: -0.15, 1.98)
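One possible remedy for the curvature in d3 (a sketch, not part of the fits above; m3_quad is a hypothetical name) is to model the squared term directly, a preview of multiple regression:

Code
# add a squared term so the model can capture the curvature in d3
m3_quad <- lm(y ~ x + I(x^2), data = d3)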

Linearity

  • Why is it important? If the true relationship between \(x\) and \(y\) is not linear, \(\hat\beta_1\) summarizes a line that does not describe the data, as in the quadratic example above.
  • How do you check it?
    • Visualize your data
    • Scatterplot of \(x\) and \(y\) (a comparison with a flexible smoother is sketched below)
    • Residuals vs fits plot
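One quick visual check (a sketch; the loess smoother used here is ggplot2's flexible default and is not covered above): overlay a flexible smoother on the scatterplot and see whether it pulls away from the least squares line.

Code
# compare the least squares line to a flexible (loess) smooth;
# systematic divergence between the two suggests non-linearity
ggplot(d3, aes(x = x, y = y)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  geom_smooth(method = "loess", formula = y ~ x,
              se = FALSE, color = "red")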

Residuals vs fits plot

How can I add the fitted values \((\hat{y})\) to my data frame?

mod <- lm(y ~ x, data = d1)
d1 <- d1 %>%
  mutate(y_hat = fitted(mod))

Residuals vs fits plot

How can I add the residual values \((e)\) to my data frame?

mod <- lm(y ~ x, data = d1)
d1 <- d1 %>%
  mutate(y_hat = fitted(mod),
         e = y - y_hat)

Residuals vs fits plot

How can I add the residual values \((e)\) to my data frame?

mod <- lm(y ~ x, data = d1)
d1 <- d1 %>%
  mutate(y_hat = fitted(mod),
         e = residuals(mod))
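An equivalent shortcut (a sketch using broom, the same package these slides use later for confidence intervals): broom::augment() returns the data along with the fitted values and residuals in one step, under the names .fitted and .resid.

Code
library(broom)

# augment() adds .fitted, .resid, and other per-observation
# diagnostics to the original data
d1_aug <- augment(mod)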

Residuals vs fits plot

How can I create a scatterplot of the residuals vs the fitted values?

ggplot(d1, aes(x = ----, y = ----)) +
  geom_---() + 
  geom_hline(yintercept = 0) +
  labs(x = "Fitted value",
       y = "Residual")

Residuals vs fits plot

How can I create a scatterplot of the residuals vs the fitted values?

ggplot(d1, aes(x = y_hat, y = e)) +
  geom_point() + 
  geom_hline(yintercept = 0) +
  labs(x = "Fitted value",
       y = "Residual")

Residuals vs fits plot

Code
ggplot(d1, aes(x = y_hat, y = e)) +
  geom_point(color = "#86a293") + 
  geom_hline(yintercept = 0, color = "#86a293") +
  labs(x = "Fitted value",
       y = "Residual")

Code
d2 <- d2 %>%
  mutate(y_hat = fitted(m2),
         e = residuals(m2))
ggplot(d2, aes(x = y_hat, y = e)) +
  geom_point(color = "#86a293") + 
  geom_hline(yintercept = 0, color = "#86a293") +
  labs(x = "Fitted value",
       y = "Residual")

Code
d3 <- d3 %>%
  mutate(y_hat = fitted(m3),
         e = residuals(m3))
ggplot(d3, aes(x = y_hat, y = e)) +
  geom_point(color = "#86a293") + 
  geom_hline(yintercept = 0, color = "#86a293") +
  labs(x = "Fitted value",
       y = "Residual")

The residuals have a mean of zero

Zero mean

Code
ggplot(d1, aes(x = x, y = y)) +
  geom_point(color = "#86a293",
             alpha = 0.5) +
  geom_segment(aes(
    x = x,
    y = y,
    xend = x,
    yend = fitted(m1)
  ),
  color = "blue") +
  geom_smooth(
    method = "lm",
    formula = "y ~ x",
    se = FALSE,
    color = "#86a293"
  )

\(\sum e_i = 0\)

Code
ggplot(d2, aes(x = x, y = y)) +
  geom_point(color = "#86a293",
             alpha = 0.5) +
    geom_segment(aes(
    x = x,
    y = y,
    xend = x,
    yend = fitted(m2)
  ),
  color = "blue") +
  geom_smooth(method = "lm",
              formula = "y ~ x",
              se = FALSE,
              color = "#86a293")

\(\sum e_i = 0\)

Code
ggplot(d3, aes(x = x, y = y)) +
  geom_point(color = "#86a293",
             alpha = 0.5) +
  geom_segment(aes(
    x = x,
    y = y,
    xend = x,
    yend = fitted(m3)
  ),
  color = "blue") +
  geom_smooth(method = "lm",
              formula = "y ~ x",
              se = FALSE,
              color = "#86a293")

\(\sum e_i = 0\)

Zero mean

  • Why is it important? It is a built-in property of least squares: minimizing the sum of squared errors with respect to \(\hat\beta_0\) forces \(\sum e_i = 0\).
  • How do you check it?
    • It will always be true for simple linear regression fit with least squares (lm in R)
    • If you would like to check anyway, here is some R code:
mod <- lm(y ~ x, data = d1)
d1 %>%
  mutate(e = residuals(mod)) %>%
  summarize(e = sum(e))
# A tibble: 1 × 1
          e
      <dbl>
1 -1.32e-16
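The sum is \(-1.32\times 10^{-16}\) rather than exactly zero only because of floating-point rounding. A tolerant check (a sketch using dplyr's near(), which compares within a small numerical tolerance):

Code
# near() avoids testing doubles with exact equality
near(sum(residuals(mod)), 0)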

Constant variance

Constant variance

  • The variance of the outcome \((y)\) does not change as the predictor \((x)\) changes
  • In other words, the spread of the points around the regression line is roughly the same across the range of \(x\)
  • Can be assessed with the “Residuals vs fits” plot

Residuals vs fits plot

How can I create a scatterplot of the residuals vs the fitted values?

ggplot(d1, aes(x = y_hat, y = e)) +
  geom_point() + 
  geom_hline(yintercept = 0) +
  labs(x = "Fitted value",
       y = "Residual")

Residuals vs fits plot

Code
ggplot(d1, aes(x = y_hat, y = e)) +
  geom_point(color = "#86a293") + 
  geom_hline(yintercept = 0, color = "#86a293") +
  labs(x = "Fitted value",
       y = "Residual")

Code
ggplot(d2, aes(x = y_hat, y = e)) +
  geom_point(color = "#86a293") + 
  geom_hline(yintercept = 0, color = "#86a293") +
  labs(x = "Fitted value",
       y = "Residual")

Code
ggplot(d3, aes(x = y_hat, y = e)) +
  geom_point(color = "#86a293") + 
  geom_hline(yintercept = 0, color = "#86a293") +
  labs(x = "Fitted value",
       y = "Residual")

Residuals vs fits plot

Code
ggplot(d1, aes(x = y_hat, y = e)) +
  geom_point(color = "#86a293") + 
  geom_hline(yintercept = 0, color = "#86a293") +
  labs(x = "Fitted value",
       y = "Residual")

Code
# simulate errors whose spread grows with x (non-constant variance)
d4 <- tibble(x = rnorm(100),
             y = 2 * x + x / 2 * rnorm(100, sd = 10))
m4 <- lm(y ~ x, data = d4)
d4 <- d4 %>%
  mutate(y_hat = fitted(m4),
         e = residuals(m4))
ggplot(d4, aes(x = y_hat, y = e)) +
  geom_point(color = "#86a293") + 
  geom_hline(yintercept = 0, color = "#86a293") +
  labs(x = "Fitted value",
       y = "Residual")

Code
# another non-constant variance example: error spread grows with x
d5 <- tibble(x = runif(100, max = 10),
             y = x * rnorm(100, sd = 10))
m5 <- lm(y ~ x, data = d5)
d5 <- d5 %>%
  mutate(y_hat = fitted(m5),
         e = residuals(m5))
ggplot(d5, aes(x = y_hat, y = e)) +
  geom_point(color = "#86a293") + 
  geom_hline(yintercept = 0, color = "#86a293") +
  labs(x = "Fitted value",
       y = "Residual")

Constant variance

  • Why is it important?
    • The least squares estimator of the relationship between \(x\) and \(y\) has the narrowest standard errors among all linear unbiased estimators (the Gauss–Markov theorem). If this assumption is violated, this is no longer true.
    • When calculating a prediction interval for a single observation, we need this assumption to make sure we are using the right variance
    • If you take STA 312, you will prove this and you will need this assumption to do so
  • How do you check it?
    • Residuals vs fits plot (a scale-location plot, sketched below, is a common variant)
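A scale-location plot (a sketch; rstandard() and this plot do not appear in the slides above) shows the square root of the absolute standardized residuals against the fitted values; a trend in this plot suggests non-constant variance.

Code
# scale-location plot for the heteroscedastic example m4
d4 %>%
  mutate(std_e = rstandard(m4)) %>%
  ggplot(aes(x = y_hat, y = sqrt(abs(std_e)))) +
  geom_point(color = "#86a293") +
  geom_smooth(method = "loess", formula = y ~ x, se = FALSE) +
  labs(x = "Fitted value",
       y = "sqrt(|Standardized residual|)")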

Normally distributed errors

Normally distributed errors

  • We assume the errors \((\varepsilon)\) follow a normal distribution
  • This allows us to use shortcuts when calculating confidence intervals and doing hypothesis tests
  • We can check this assumption by examining the distribution of the residuals

What kind of plot could help us assess whether a variable’s distribution is Normal?

Histogram

mod <- lm(y ~ x, data = d1)
d1 <- d1 %>%
  mutate(e = ---(mod))

ggplot(d1, aes(x = ---)) + 
  geom_----

Histogram

mod <- lm(y ~ x, data = d1)
d1 <- d1 %>%
  mutate(e = residuals(mod))

ggplot(d1, aes(x = e)) + 
  geom_histogram(bins = 8)

Histogram

Code
ggplot(d1, aes(x = e)) +
  geom_histogram(fill = "#86a293", bins = 20)

Code
ggplot(d2, aes(x = e)) +
  geom_histogram(fill = "#86a293", bins = 20)

Code
ggplot(d3, aes(x = e)) +
  geom_histogram(fill = "#86a293", bins = 20)

Normal Quantile (Q-Q) plot

  • Scatterplot of the ordered observed residuals versus the values (the theoretical quantiles) that we would expect from a “perfect” normal sample of the same sample size
ggplot(d1, aes(sample = e)) +
  geom_qq()  + 
  geom_qq_line()
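To see what geom_qq() is computing (a sketch; ppoints() supplies evenly spaced probability points, which matches geom_qq's default behavior): sort the residuals and pair them with standard normal quantiles.

Code
# a Q-Q plot by hand: sorted residuals vs standard normal quantiles
n <- nrow(d1)
tibble(theoretical = qnorm(ppoints(n)),
       observed = sort(d1$e)) %>%
  ggplot(aes(x = theoretical, y = observed)) +
  geom_point() +
  labs(x = "Theoretical quantile", y = "Observed residual")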


Q-Q plots

Code
ggplot(d1, aes(sample = e)) +
  geom_qq(color = "#86a293") + 
  geom_qq_line(color = "#86a293")

Code
ggplot(d2, aes(sample = e)) +
  geom_qq(color = "#86a293") + 
  geom_qq_line(color = "#86a293")

Code
ggplot(d3, aes(sample = e)) +
  geom_qq(color = "#86a293") + 
  geom_qq_line(color = "#86a293")

Normally distributed errors

  • Why is it important?
    • In very small samples, it is important for our 95% confidence intervals to actually have 95% coverage
    • It is also important for prediction intervals (when making an interval for a single observation)
    • In medium / large samples it is often ok regardless, thanks to the central limit theorem
  • How do you check it?
    • Visualize the residuals (histogram, Q-Q plot); a formal test is sketched below
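For completeness, base R also offers a formal test (a sketch; these slides rely on visual checks, and the Shapiro–Wilk test can flag trivial deviations when the sample is large):

Code
# Shapiro-Wilk test: a small p-value suggests the residuals are
# inconsistent with a normal distribution
shapiro.test(residuals(m1))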

Independence & Randomness

Magnolia data

Code
full_magnolia_data %>%
  nest_by(id) %>%
  mutate(mod = list(lm(leaf_length ~ leaf_width, data = data))) %>%
  summarise(broom::tidy(mod, conf.int = TRUE)) %>%
  filter(term == "leaf_width") %>%
  ggplot() +
  geom_segment(aes(
    x = conf.low,
    xend = conf.high,
    y = id,
    yend = id,
    color = id == 3
  )) +
  geom_vline(xintercept = coef(lm(leaf_length ~ leaf_width, full_magnolia_data))[2], lty = 2) +
  scale_x_continuous(limits = c(-2, 4)) + 
  scale_color_manual(values = c("black", "cornflower blue")) +
  labs(x = "Relationship between leaf length and leaf width (Slope)",
       y = "Student") +
  theme(legend.position = "none")

Magnolia data

Code
full_magnolia_data %>%
  nest_by(id) %>%
  mutate(mod = list(lm(leaf_length ~ leaf_width, data = data))) %>%
  summarise(broom::tidy(mod, conf.int = TRUE)) %>%
  filter(term == "leaf_width") %>%
  ggplot() +
  geom_segment(aes(
    x = conf.low,
    xend = conf.high,
    y = id,
    yend = id,
    color = id == 3
  )) +
  geom_vline(xintercept = coef(lm(leaf_length ~ leaf_width, full_magnolia_data))[2], lty = 2) +
  scale_x_continuous(limits = c(-2, 4)) + 
  scale_color_manual(values = c("black", "cornflower blue")) +
  labs(x = "Relationship between leaf length and leaf width (Slope)",
       y = "Student") +
  theme(legend.position = "none")
  • If these were truly sampled randomly and observations were independent, we’d expect that 95% of these confidence intervals would contain the “true” \(\beta_1\)

Magnolia data

Code
full_magnolia_data %>%
  nest_by(id) %>%
  mutate(mod = list(lm(leaf_length ~ leaf_width, data = data))) %>%
  summarise(broom::tidy(mod, conf.int = TRUE)) %>%
  filter(term == "leaf_width") %>%
  ggplot() +
  geom_segment(aes(
    x = conf.low,
    xend = conf.high,
    y = id,
    yend = id,
    color = id == 3
  )) +
  geom_vline(xintercept = coef(lm(leaf_length ~ leaf_width, full_magnolia_data))[2], lty = 2) +
  scale_x_continuous(limits = c(-2, 4)) + 
  scale_color_manual(values = c("black", "cornflower blue")) +
  labs(x = "Relationship between leaf length and leaf width (Slope)",
       y = "Student") +
  theme(legend.position = "none")

Are these independent?

Magnolia data

Code
full_magnolia_data %>%
  nest_by(id) %>%
  mutate(mod = list(lm(leaf_length ~ leaf_width, data = data))) %>%
  summarise(broom::tidy(mod, conf.int = TRUE)) %>%
  filter(term == "leaf_width") %>%
  ggplot() +
  geom_segment(aes(
    x = conf.low,
    xend = conf.high,
    y = id,
    yend = id,
    color = id == 3
  )) +
  scale_x_continuous(limits = c(-2, 4)) + 
  geom_vline(xintercept = coef(lm(leaf_length ~ leaf_width, full_magnolia_data))[2], 
             lty = 2) +
  scale_color_manual(values = c("black", "cornflower blue")) +
  labs(x = "Relationship between leaf length and leaf width (Slope)",
       y = "Student") +
  theme(legend.position = "none")

Was the sample random?

Coverage probability

Code
avg_slope <- coef(lm(leaf_length ~ leaf_width, data = full_magnolia_data))[2]
full_magnolia_data %>%
  nest_by(id) %>%
  mutate(mod = list(lm(leaf_length ~ leaf_width, data = data))) %>%
  summarise(broom::tidy(mod, conf.int = TRUE)) %>%
  filter(term == "leaf_width") %>%
  ungroup() %>%
  summarise(coverage = mean(
    conf.low < avg_slope &
      conf.high > avg_slope
  ))
# A tibble: 1 × 1
  coverage
     <dbl>
1    0.611

Magnolia data

Let’s actually randomly sample from your magnolia data

set.seed(928)
full_magnolia_data <- full_magnolia_data %>%
  mutate(random_id = sample(1:44, size = n(), replace = TRUE))
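Giving each leaf a random_id scrambles the observations across 44 new “students”, so each group now behaves like a random sample from the full data.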

Magnolia data

Code
full_magnolia_data %>%
  nest_by(random_id) %>%
  mutate(mod = list(lm(leaf_length ~ leaf_width, data = data))) %>%
  summarise(broom::tidy(mod, conf.int = TRUE)) %>%
  filter(term == "leaf_width") %>%
  ggplot() +
  geom_segment(aes(
    x = conf.low,
    xend = conf.high,
    y = random_id,
    yend = random_id,
    color = random_id == 3
  )) +
  geom_vline(xintercept = coef(lm(leaf_length ~ leaf_width, full_magnolia_data))[2], lty = 2) +
  scale_x_continuous(limits = c(-2, 4)) + 
  scale_color_manual(values = c("black", "cornflower blue")) +
  labs(x = "Relationship between leaf length and leaf width (Slope)",
       y = "Student") +
  theme(legend.position = "none")

Coverage probability

Code
full_magnolia_data %>%
  nest_by(random_id) %>%
  mutate(mod = list(lm(leaf_length ~ leaf_width, data = data))) %>%
  summarise(broom::tidy(mod, conf.int = TRUE)) %>%
  filter(term == "leaf_width") %>%
  ungroup() %>%
  summarise(coverage = mean(
    conf.low < avg_slope &
      conf.high > avg_slope
  ))
# A tibble: 1 × 1
  coverage
     <dbl>
1    0.909
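This 90.9% coverage is much closer to the nominal 95% than the 61.1% we saw with the original, non-random groupings.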

Independence

  • Why is it important? When observations are not independent, the usual standard errors are too small, so 95% confidence intervals cover the truth far less often than advertised, as the magnolia data showed (61% coverage instead of the nominal 95%).
  • How can we check it?
    • If you only have one data sample, there is no formal check; you need to know about the data generation / sampling process. A simulation of coverage under true independence is sketched below.
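A small simulation sketch (the sample size, seed, and 1,000 replicates are arbitrary choices, not from the slides): when observations really are independent draws, roughly 95% of the 95% confidence intervals for the slope cover the true \(\beta_1 = 2\).

Code
# simulate many independent samples from y = 2x + noise and record
# how often the 95% CI for the slope covers the true value (2)
set.seed(42)
covered <- replicate(1000, {
  d <- tibble(x = rnorm(50), y = 2 * x + rnorm(50))
  ci <- confint(lm(y ~ x, data = d))["x", ]
  ci[1] < 2 & ci[2] > 2
})
mean(covered)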