Simple Linear Regression

Lucy D’Agostino McGowan

Relationship between two continuous variables

Visualizing two continuous variables

How can we visualize the relationship between two continuous variables?

Code
# Keep characters with both height and mass recorded and a mass below 1000
# (the object name suggests the excluded outlier is Jabba).
starwars_nojabba <- starwars %>%
  filter(!is.na(height), !is.na(mass), mass < 1000)

# Scatterplot: height on the x-axis, mass on the y-axis.
ggplot(data = starwars_nojabba, mapping = aes(x = height, y = mass)) +
  geom_point(color = "#86a293")

Visualizing two continuous variables

If we know a Star Wars character’s height, can we guess how much they weigh?

Code
# Scatterplot: one point per character, height on x and mass on y.
ggplot(data = starwars_nojabba, mapping = aes(x = height, y = mass)) +
  geom_point(color = "#86a293")

Data = model + error

\(y = f(x) + \varepsilon\)

Simple linear regression

\[\Large\mathbf{y} = f(\mathbf{x}) + \varepsilon\]

Simple linear regression

\[\Large{\require{color}\colorbox{#86a293}{$\mathbf{y}$}} = f(\mathbf{x}) + \varepsilon\]

continuous (dependent) variable

Simple linear regression

\[\Large\mathbf{y} = f({\require{color}\colorbox{#86a293}{$\mathbf{x}$}}) + \varepsilon\]

continuous (independent) variable

Simple linear regression

\[\Large\mathbf{y} = {\require{color}\colorbox{#86a293}{$f(\mathbf{x})$}} + \varepsilon\]

a function that gives the mean value of \(y\) at any value of \(x\)

Simple linear regression

What function do you think we are using to get the mean value of \(y\) with simple linear regression?

Code
# Scatterplot of mass against height with the fitted lm() line overlaid;
# se = FALSE suppresses the confidence band.
ggplot(data = starwars_nojabba, mapping = aes(x = height, y = mass)) +
  geom_point(color = "#86a293") +
  geom_smooth(method = "lm", formula = "y ~ x", se = FALSE, color = "#86a293") +
  labs(
    title = "The relationship between mass and height for Star Wars characters",
    caption = "Data from SWAPI (swapi.dev)"
  )

We express the mean mass of the Star Wars characters as a linear function of their height

Simple linear regression

What is the equation that represents this line?

Code
# Show only the fitted lm() line (no points, no confidence band).
ggplot(data = starwars_nojabba, mapping = aes(x = height, y = mass)) +
  geom_smooth(method = "lm", formula = "y ~ x", se = FALSE, color = "#86a293") +
  labs(
    title = "The relationship between mass and height for Star Wars characters",
    caption = "Data from SWAPI (swapi.dev)"
  )

Simple linear regression

\[\Large y = \beta_0 + \beta_1 x\]

Code
# Show only the fitted lm() line (no points, no confidence band).
ggplot(data = starwars_nojabba, mapping = aes(x = height, y = mass)) +
  geom_smooth(method = "lm", formula = "y ~ x", se = FALSE, color = "#86a293") +
  labs(
    title = "The relationship between mass and height for Star Wars characters",
    caption = "Data from SWAPI (swapi.dev)"
  )

Simple linear regression

\[\Large \textrm{mass} = \beta_0 + \beta_1 \times \textrm{height}\]

Code
# Show only the fitted lm() line (no points, no confidence band).
ggplot(data = starwars_nojabba, mapping = aes(x = height, y = mass)) +
  geom_smooth(method = "lm", formula = "y ~ x", se = FALSE, color = "#86a293") +
  labs(
    title = "The relationship between mass and height for Star Wars characters",
    caption = "Data from SWAPI (swapi.dev)"
  )

Simple linear regression

What is \(\beta_0\)?

Code
# Show only the fitted lm() line (no points, no confidence band).
ggplot(data = starwars_nojabba, mapping = aes(x = height, y = mass)) +
  geom_smooth(method = "lm", formula = "y ~ x", se = FALSE, color = "#86a293") +
  labs(
    title = "The relationship between mass and height for Star Wars characters",
    caption = "Data from SWAPI (swapi.dev)"
  )

Simple linear regression

What is \(\beta_0\)?

Code
library(geomtextpath)

# Extend the axes down to height = 0 to show where the fitted line crosses
# the y-axis. The intercept (-32.54) and slope (0.62) are the rounded
# coefficients of lm(mass ~ height).
ggplot(starwars_nojabba, aes(height, mass)) +
  geom_abline(
    intercept = -32.54,
    slope = 0.62,
    color = "#86a293"
  ) +
  geom_vline(xintercept = 0, lty = 2) +
  # Dashed horizontal line at the intercept, labelled with the parsed
  # plotmath expression beta[0].
  geom_texthline(
    yintercept = -32.54,
    lty = 2,
    lwd = 6,
    label = as.character(expression(beta[0])),
    parse = TRUE,
    hjust = 0.1
  ) +
  scale_x_continuous(limits = c(-5, 100)) +
  scale_y_continuous(limits = c(-40, 100)) +
  # annotate() draws the marker exactly once; geom_point(aes(<constants>))
  # would draw one overlapping copy for every row of the data.
  annotate("point", x = 0, y = -32.54, color = "red", size = 3) +
  labs(title = "The relationship between mass and height for Star Wars characters",
       caption =  "Data from SWAPI (swapi.dev)")

Simple linear regression

Is this meaningful in this dataset?

Code
# Extend the axes down to height = 0 to show where the fitted line crosses
# the y-axis. The intercept (-32.54) and slope (0.62) are the rounded
# coefficients of lm(mass ~ height).
ggplot(starwars_nojabba, aes(height, mass)) +
  geom_abline(
    intercept = -32.54,
    slope = 0.62,
    color = "#86a293"
  ) +
  geom_vline(xintercept = 0, lty = 2) +
  # Dashed horizontal line at the intercept, labelled with the parsed
  # plotmath expression beta[0].
  geom_texthline(
    yintercept = -32.54,
    lty = 2,
    lwd = 6,
    label = as.character(expression(beta[0])),
    parse = TRUE,
    hjust = 0.1
  ) +
  scale_x_continuous(limits = c(-5, 100)) +
  scale_y_continuous(limits = c(-40, 100)) +
  # annotate() draws the marker exactly once; geom_point(aes(<constants>))
  # would draw one overlapping copy for every row of the data.
  annotate("point", x = 0, y = -32.54, color = "red", size = 3) +
  labs(title = "The relationship between mass and height for Star Wars characters",
       caption =  "Data from SWAPI (swapi.dev)")

Simple linear regression

What is \(\beta_1\)?

Code
# Zoom in around height = 0 to illustrate the slope: moving one unit right
# (dashed run from x = 0 to x = 1) raises the line by 0.62 (labelled rise).
# The intercept and slope are the rounded coefficients of lm(mass ~ height).
ggplot(starwars_nojabba, aes(x = height, y = mass)) +
  geom_abline(
    intercept = -32.54,
    slope = 0.62,
    color = "#86a293"
  ) +
  # annotate() draws the run once; geom_segment(aes(<constants>)) would draw
  # one overlapping copy for every row of the data.
  annotate(
    "segment",
    x = 0,
    xend = 1,
    y = -32.54,
    yend = -32.54,
    lty = 2
  ) +
  scale_x_continuous(limits = c(-5, 5), breaks = -5:5) +
  scale_y_continuous(limits = c(-35, -30)) +
  # Vertical rise at x = 1, labelled with the parsed plotmath expression
  # beta[1]; the "textsegment" geom comes from geomtextpath.
  annotate(
    "textsegment",
    x = 1,
    xend = 1,
    y = -32.54,
    yend = -32.54 + 0.62,
    label = as.character(expression(beta[1])),
    parse = TRUE,
    angle = 0
  ) +
  labs(title = "The relationship between mass and height for Star Wars characters",
       caption = "Data from SWAPI (swapi.dev)")

Do all of the points fall exactly on the line?

\(y = \beta_0 + \beta_1 x + \color{blue}{\varepsilon}\)

Simple linear regression

\[y = \beta_0 + \beta_1 x + \color{blue}{\varepsilon}\]

Code
# Scatterplot of mass against height with the fitted lm() line overlaid;
# se = FALSE suppresses the confidence band.
ggplot(data = starwars_nojabba, mapping = aes(x = height, y = mass)) +
  geom_point(color = "#86a293") +
  geom_smooth(method = "lm", formula = "y ~ x", se = FALSE, color = "#86a293") +
  labs(
    title = "The relationship between mass and height for Star Wars characters",
    caption = "Data from SWAPI (swapi.dev)"
  )

Simple linear regression

\[y = \beta_0 + \beta_1 x + \color{blue}{\varepsilon}\]

Code
# Attach the fitted values from the mass ~ height regression so each
# observation can be joined to the line. `.` is the data frame being piped.
starwars_nojabba <- starwars_nojabba %>%
  mutate(fitted = fitted(lm(mass ~ height, data = .)))

# Blue vertical segments run from each observed mass to its fitted value;
# x and y are inherited from the plot-level mapping.
ggplot(data = starwars_nojabba, mapping = aes(x = height, y = mass)) +
  geom_point(color = "#86a293") +
  geom_segment(aes(xend = height, yend = fitted), color = "blue") +
  geom_smooth(method = "lm", formula = "y ~ x", se = FALSE, color = "#86a293") +
  labs(
    title = "The relationship between mass and height for Star Wars characters",
    caption = "Data from SWAPI (swapi.dev)"
  )

The truth

\[\Large y = \beta_0 + \beta_1 x + \varepsilon\]

  • If we had the whole population of the Star Wars universe we could quantify the exact relationship between \(x\) and \(y\)

The reality

\[\Large \hat{y} = \hat{\beta}_0 + \hat{\beta}_1 x\]

  • In reality, we have a sample of Star Wars characters to estimate the relationship between \(x\) and \(y\) (height and mass).
  • The “hats” represent the estimated (fitted) values

Put a hat on it

How can you tell the difference between a parameter that is from the whole population versus an estimate from the sample?

Let’s pause for definitions

Definitions

  • parameters: \(\beta_0\), \(\beta_1\)
  • \(\beta_0\): intercept
  • \(\beta_1\): slope
  • simple linear model: \(y = \beta_0 + \beta_1 x + \varepsilon\) estimated by \(\hat{y} = \hat{\beta}_0 + \hat{\beta}_1 x\)

Let’s do this in R

Simple linear regression

library(tidyverse)

# Keep characters with mass below 1000; filter() also drops rows where the
# comparison is NA, i.e. characters with missing mass.
starwars_nojabba <- filter(starwars, mass < 1000)

# Regress mass on height; printing the fit shows the estimated coefficients.
lm(formula = mass ~ height, data = starwars_nojabba)

Call:
lm(formula = mass ~ height, data = starwars_nojabba)

Coefficients:
(Intercept)       height  
   -32.5408       0.6214  
  • lm = “linear model”

Simple linear regression

library(tidyverse)

# Keep characters with mass below 1000; filter() also drops rows where the
# comparison is NA, i.e. characters with missing mass.
starwars_nojabba <- filter(starwars, mass < 1000)

# Regress mass on height; printing the fit shows the estimated coefficients.
lm(formula = mass ~ height, data = starwars_nojabba)

Call:
lm(formula = mass ~ height, data = starwars_nojabba)

Coefficients:
(Intercept)       height  
   -32.5408       0.6214  

What is \(\hat\beta_0\)?

Simple linear regression

library(tidyverse)

# Keep characters with mass below 1000; filter() also drops rows where the
# comparison is NA, i.e. characters with missing mass.
starwars_nojabba <- filter(starwars, mass < 1000)

# Regress mass on height; printing the fit shows the estimated coefficients.
lm(formula = mass ~ height, data = starwars_nojabba)

Call:
lm(formula = mass ~ height, data = starwars_nojabba)

Coefficients:
(Intercept)       height  
   -32.5408       0.6214  

What is \(\hat\beta_1\)?

Interpretation

library(tidyverse)

# Keep characters with mass below 1000; filter() also drops rows where the
# comparison is NA, i.e. characters with missing mass.
starwars_nojabba <- filter(starwars, mass < 1000)

# Regress mass on height; printing the fit shows the estimated coefficients.
lm(formula = mass ~ height, data = starwars_nojabba)

Call:
lm(formula = mass ~ height, data = starwars_nojabba)

Coefficients:
(Intercept)       height  
   -32.5408       0.6214  

For every one centimeter increase in height, a Star Wars character’s mass increases on average by 0.62 kilograms.

For every one unit increase in \(x\) the expected change in \(y\) is \(\hat\beta_1\)

Two continuous variables

Why do we fit a linear model?

  • Reduces the dimensionality of the data (from \(n \times 2\) values to 2 parameters)
  • To get a sense of the relationship between x and y for a “typical” observation
    • When is this an accurate representation?

Meaningful linear models

Code
# Fix the RNG state so the simulated examples are reproducible.
set.seed(1)

# Linear signal: y = 2x plus standard normal noise.
d1 <- tibble(
  x = rnorm(n = 100),
  y = 2 * x + rnorm(n = 100)
)
ggplot(data = d1, mapping = aes(x = x, y = y)) +
  geom_point(alpha = 0.5, color = "#86a293") +
  geom_smooth(method = "lm", formula = "y ~ x", se = FALSE, color = "#86a293")

Code
# No signal: x and y are independent standard normal draws.
d2 <- tibble(
  x = rnorm(n = 100),
  y = rnorm(n = 100)
)
ggplot(data = d2, mapping = aes(x = x, y = y)) +
  geom_point(alpha = 0.5, color = "#86a293") +
  geom_smooth(method = "lm", formula = "y ~ x", se = FALSE, color = "#86a293")

Code
# Quadratic signal: y = 3x^2 plus standard normal noise, with a straight
# line fitted to it anyway.
d3 <- tibble(
  x = rnorm(n = 100),
  y = 3 * x^2 + rnorm(n = 100)
)
ggplot(data = d3, mapping = aes(x = x, y = y)) +
  geom_point(alpha = 0.5, color = "#86a293") +
  geom_smooth(method = "lm", formula = "y ~ x", se = FALSE, color = "#86a293")

Which lines represent “typical” observations?

Meaningful linear models

Code
# Cubic signal: y = x + x^2 + 4x^3 plus standard normal noise, with a
# straight line fitted to it anyway.
d4 <- tibble(
  x = rnorm(n = 100),
  y = x + x^2 + 4 * x^3 + rnorm(n = 100)
)
ggplot(data = d4, mapping = aes(x = x, y = y)) +
  geom_point(alpha = 0.5, color = "#86a293") +
  geom_smooth(method = "lm", formula = "y ~ x", se = FALSE, color = "#86a293")

Code
# Linear signal with two extreme observations: `c` is 0 for the first 98
# rows and 1 for the last 2, which shifts those two y values up by 100.
d5 <- tibble(
  x = rnorm(n = 100),
  c = rep(c(0, 1), times = c(98, 2)),
  y = x + 100 * c + rnorm(n = 100)
)
ggplot(data = d5, mapping = aes(x = x, y = y)) +
  geom_point(alpha = 0.5, color = "#86a293") +
  geom_smooth(method = "lm", formula = "y ~ x", se = FALSE, color = "#86a293")

Code
# Two clusters in y unrelated to x: the mean vector c(0, 10) is recycled,
# so successive y draws alternate between mean 0 and mean 10.
d6 <- tibble(
  x = rnorm(n = 100),
  y = rnorm(n = 100, mean = c(0, 10))
)
ggplot(data = d6, mapping = aes(x = x, y = y)) +
  geom_point(alpha = 0.5, color = "#86a293") +
  geom_smooth(method = "lm", formula = "y ~ x", se = FALSE, color = "#86a293")

Which lines represent “typical” observations?

Recap

When is a simple linear model an appropriate summary measure to calculate?

What assumptions need to be true in order to use a simple linear model to represent your two continuous variables?