library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'stringr' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(rstanarm) # Bayesian regression using Stan
## Warning: package 'rstanarm' was built under R version 4.3.3
## Loading required package: Rcpp
## This is rstanarm version 2.32.1
## - See https://mc-stan.org/rstanarm/articles/priors for changes to default priors!
## - Default priors may change, so it's safest to specify priors, even if equivalent to the defaults.
## - For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores())
library(bayesplot) # Visualization for MCMC
## This is bayesplot version 1.14.0
## - Online documentation and vignettes at mc-stan.org/bayesplot
## - bayesplot theme set to bayesplot::theme_default()
## * Does _not_ affect other ggplot2 plots
## * See ?bayesplot_theme_set for details on theme setting
library(coda) # MCMC diagnostics
## Warning: package 'coda' was built under R version 4.3.3
library(readr)
house_data <- read_csv("kc_house_data.csv")
## Rows: 21613 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): id
## dbl (19): price, bedrooms, bathrooms, sqft_living, sqft_lot, floors, waterf...
## dttm (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Look at the data
glimpse(house_data)
## Rows: 21,613
## Columns: 21
## $ id <chr> "7129300520", "6414100192", "5631500400", "2487200875", …
## $ date <dttm> 2014-10-13, 2014-12-09, 2015-02-25, 2014-12-09, 2015-02…
## $ price <dbl> 221900, 538000, 180000, 604000, 510000, 1225000, 257500,…
## $ bedrooms <dbl> 3, 3, 2, 4, 3, 4, 3, 3, 3, 3, 3, 2, 3, 3, 5, 4, 3, 4, 2,…
## $ bathrooms <dbl> 1.00, 2.25, 1.00, 3.00, 2.00, 4.50, 2.25, 1.50, 1.00, 2.…
## $ sqft_living <dbl> 1180, 2570, 770, 1960, 1680, 5420, 1715, 1060, 1780, 189…
## $ sqft_lot <dbl> 5650, 7242, 10000, 5000, 8080, 101930, 6819, 9711, 7470,…
## $ floors <dbl> 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1…
## $ waterfront <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ view <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0,…
## $ condition <dbl> 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 4, 4,…
## $ grade <dbl> 7, 7, 6, 7, 8, 11, 7, 7, 7, 7, 8, 7, 7, 7, 7, 9, 7, 7, 7…
## $ sqft_above <dbl> 1180, 2170, 770, 1050, 1680, 3890, 1715, 1060, 1050, 189…
## $ sqft_basement <dbl> 0, 400, 0, 910, 0, 1530, 0, 0, 730, 0, 1700, 300, 0, 0, …
## $ yr_built <dbl> 1955, 1951, 1933, 1965, 1987, 2001, 1995, 1963, 1960, 20…
## $ yr_renovated <dbl> 0, 1991, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ zipcode <dbl> 98178, 98125, 98028, 98136, 98074, 98053, 98003, 98198, …
## $ lat <dbl> 47.5112, 47.7210, 47.7379, 47.5208, 47.6168, 47.6561, 47…
## $ long <dbl> -122.257, -122.319, -122.233, -122.393, -122.045, -122.0…
## $ sqft_living15 <dbl> 1340, 1690, 2720, 1360, 1800, 4760, 2238, 1650, 1780, 23…
## $ sqft_lot15 <dbl> 5650, 7639, 8062, 5000, 7503, 101930, 6819, 9711, 8113, …
summary(house_data)
## id date price
## Length:21613 Min. :2014-05-02 00:00:00.00 Min. : 75000
## Class :character 1st Qu.:2014-07-22 00:00:00.00 1st Qu.: 321950
## Mode :character Median :2014-10-16 00:00:00.00 Median : 450000
## Mean :2014-10-29 04:38:01.96 Mean : 540088
## 3rd Qu.:2015-02-17 00:00:00.00 3rd Qu.: 645000
## Max. :2015-05-27 00:00:00.00 Max. :7700000
## bedrooms bathrooms sqft_living sqft_lot
## Min. : 0.000 Min. :0.000 Min. : 290 Min. : 520
## 1st Qu.: 3.000 1st Qu.:1.750 1st Qu.: 1427 1st Qu.: 5040
## Median : 3.000 Median :2.250 Median : 1910 Median : 7618
## Mean : 3.371 Mean :2.115 Mean : 2080 Mean : 15107
## 3rd Qu.: 4.000 3rd Qu.:2.500 3rd Qu.: 2550 3rd Qu.: 10688
## Max. :33.000 Max. :8.000 Max. :13540 Max. :1651359
## floors waterfront view condition
## Min. :1.000 Min. :0.000000 Min. :0.0000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:0.000000 1st Qu.:0.0000 1st Qu.:3.000
## Median :1.500 Median :0.000000 Median :0.0000 Median :3.000
## Mean :1.494 Mean :0.007542 Mean :0.2343 Mean :3.409
## 3rd Qu.:2.000 3rd Qu.:0.000000 3rd Qu.:0.0000 3rd Qu.:4.000
## Max. :3.500 Max. :1.000000 Max. :4.0000 Max. :5.000
## grade sqft_above sqft_basement yr_built
## Min. : 1.000 Min. : 290 Min. : 0.0 Min. :1900
## 1st Qu.: 7.000 1st Qu.:1190 1st Qu.: 0.0 1st Qu.:1951
## Median : 7.000 Median :1560 Median : 0.0 Median :1975
## Mean : 7.657 Mean :1788 Mean : 291.5 Mean :1971
## 3rd Qu.: 8.000 3rd Qu.:2210 3rd Qu.: 560.0 3rd Qu.:1997
## Max. :13.000 Max. :9410 Max. :4820.0 Max. :2015
## yr_renovated zipcode lat long
## Min. : 0.0 Min. :98001 Min. :47.16 Min. :-122.5
## 1st Qu.: 0.0 1st Qu.:98033 1st Qu.:47.47 1st Qu.:-122.3
## Median : 0.0 Median :98065 Median :47.57 Median :-122.2
## Mean : 84.4 Mean :98078 Mean :47.56 Mean :-122.2
## 3rd Qu.: 0.0 3rd Qu.:98118 3rd Qu.:47.68 3rd Qu.:-122.1
## Max. :2015.0 Max. :98199 Max. :47.78 Max. :-121.3
## sqft_living15 sqft_lot15
## Min. : 399 Min. : 651
## 1st Qu.:1490 1st Qu.: 5100
## Median :1840 Median : 7620
## Mean :1987 Mean : 12768
## 3rd Qu.:2360 3rd Qu.: 10083
## Max. :6210 Max. :871200
# Select relevant predictors for regression
data <- house_data %>%
select(price, sqft_living, bedrooms, bathrooms, grade, condition)
Loads the King County house sales dataset with read_csv().
glimpse() shows the data types and a preview of values; summary() gives summary statistics (min, quartiles, median, mean, max) for each column.
select() keeps only the outcome (price) and the five predictors used in the regression.
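Before fitting, it can be worth confirming there are no missing values in the selected columns (there should be none in this dataset); a quick check using the dplyr functions already loaded:
# Count NA values in each selected column (all counts should be 0)
data %>%
  summarise(across(everything(), ~ sum(is.na(.x))))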
# Scatter plot of price vs sqft_living colored by bedrooms
ggplot(data, aes(x = sqft_living, y = price, color = factor(bedrooms))) +
geom_point(alpha = 0.6) +
labs(title = "King County House Prices",
x = "Square Footage",
y = "Price",
color = "Bedrooms") +
theme_minimal()
Scatter plot of price vs sqft_living, with points colored by the number of bedrooms.
alpha = 0.6 → makes points slightly transparent so overlapping points remain visible.
Output:
Plot shows a positive trend: bigger houses generally cost more.
Colors distinguish bedroom counts, with most visible points falling in the 1–11 bedroom range.
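Because prices are heavily right-skewed (median $450,000 vs. a maximum of $7.7 million in the summary above), a log scale on the price axis can make the trend easier to read; a small variation on the plot above:
# Same scatter plot with price on a log10 scale to spread out the skewed prices
ggplot(data, aes(x = sqft_living, y = price, color = factor(bedrooms))) +
  geom_point(alpha = 0.6) +
  scale_y_log10() +
  labs(title = "King County House Prices (log scale)",
       x = "Square Footage",
       y = "Price (log scale)",
       color = "Bedrooms") +
  theme_minimal()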
# Fit Bayesian model with weakly informative priors
bayes_model <- stan_glm(
price ~ sqft_living + bedrooms + bathrooms + grade + condition,
data = data,
family = gaussian(),
prior = normal(0, 1e4),
prior_intercept = normal(0, 1e5),
chains = 4,
iter = 2000,
seed = 123
)
##
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 1).
## Chain 1:
## Chain 1: Gradient evaluation took 8.5e-05 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.85 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1:
## Chain 1:
## Chain 1: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 1: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 1: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 1: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 1: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 1:
## Chain 1: Elapsed Time: 7.243 seconds (Warm-up)
## Chain 1: 2.282 seconds (Sampling)
## Chain 1: 9.525 seconds (Total)
## Chain 1:
##
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 2).
## Chain 2:
## Chain 2: Gradient evaluation took 2.2e-05 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.22 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2:
## Chain 2:
## Chain 2: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 2: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 2: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 2: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 2: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 2: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 2: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 2: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 2: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 2: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 2: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 2: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 2:
## Chain 2: Elapsed Time: 2.974 seconds (Warm-up)
## Chain 2: 2.262 seconds (Sampling)
## Chain 2: 5.236 seconds (Total)
## Chain 2:
##
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 3).
## Chain 3:
## Chain 3: Gradient evaluation took 1.7e-05 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.17 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3:
## Chain 3:
## Chain 3: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 3: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 3: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 3: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 3: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 3: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 3: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 3: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 3: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 3: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 3: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 3: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 3:
## Chain 3: Elapsed Time: 5.677 seconds (Warm-up)
## Chain 3: 2.952 seconds (Sampling)
## Chain 3: 8.629 seconds (Total)
## Chain 3:
##
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 4).
## Chain 4:
## Chain 4: Gradient evaluation took 2.8e-05 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.28 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4:
## Chain 4:
## Chain 4: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 4: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 4: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 4: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 4: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 4: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 4: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 4: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 4: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 4: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 4: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 4: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 4:
## Chain 4: Elapsed Time: 11.306 seconds (Warm-up)
## Chain 4: 2.723 seconds (Sampling)
## Chain 4: 14.029 seconds (Total)
## Chain 4:
# Summarize posterior estimates
print(summary(bayes_model), digits = 3)
##
## Model Info:
## function: stan_glm
## family: gaussian [identity]
## formula: price ~ sqft_living + bedrooms + bathrooms + grade + condition
## algorithm: sampling
## sample: 4000 (posterior sample size)
## priors: see help('prior_summary')
## observations: 21613
## predictors: 6
##
## Estimates:
## mean sd 10% 50% 90%
## (Intercept) -697452.788 17362.714 -719927.447 -697499.281 -675547.735
## sqft_living 222.461 3.483 217.954 222.479 226.930
## bedrooms -42090.464 2277.659 -45005.879 -42155.233 -39100.964
## bathrooms -15383.457 3308.104 -19672.011 -15437.799 -11129.827
## grade 97291.881 2210.908 94423.150 97278.441 100139.271
## condition 59875.172 2527.083 56630.936 59931.829 63111.916
## sigma 244698.023 1181.456 243176.639 244700.703 246199.166
##
## Fit Diagnostics:
## mean sd 10% 50% 90%
## mean_PPD 539865.287 2350.986 536814.554 539866.532 542876.219
##
## The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).
##
## MCMC diagnostics
## mcse Rhat n_eff
## (Intercept) 314.559 1.000 3047
## sqft_living 0.054 1.000 4159
## bedrooms 39.109 1.000 3392
## bathrooms 54.846 1.000 3638
## grade 39.704 1.000 3101
## condition 42.199 1.000 3586
## sigma 16.610 1.000 5060
## mean_PPD 39.083 0.999 3618
## log-posterior 0.044 1.003 1807
##
## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
price is predicted by sqft_living, bedrooms, bathrooms, grade, and condition.
Gaussian family → assumes normally distributed residuals.
Priors are intended to be weakly informative (wide), so the data mostly drives the estimates.
chains = 4 and iter = 2000 → 4 MCMC chains of 2000 iterations each; the first 1000 iterations of each chain are warmup, leaving the 4000 posterior draws reported as "sample: 4000" in the summary.
stan_glm() automatically runs MCMC (Hamiltonian Monte Carlo via Stan) to estimate the posterior distributions.
Output:
Posterior samples are generated for the intercept, every coefficient, and sigma.
These samples are used later for predictions and credible intervals.
The Estimates table reports the posterior mean, sd, and 10%/50%/90% quantiles for each parameter; full 95% credible intervals are computed below with posterior_interval().
Note that bedrooms and bathrooms have negative coefficients: holding square footage, grade, and condition fixed, adding rooms without adding space is associated with a lower price.
Rhat near 1 and large n_eff → good convergence.
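To confirm exactly which priors the fitted model actually used (the rstanarm startup message recommends checking, since defaults can change), prior_summary() reports them:
# Report the priors applied to the intercept, coefficients, and sigma
prior_summary(bayes_model)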
posterior <- as.matrix(bayes_model)
head(posterior[, 1:5])
## parameters
## iterations (Intercept) sqft_living bedrooms bathrooms grade
## [1,] -701242.1 221.8258 -41868.13 -14677.80 99861.57
## [2,] -690355.0 223.1933 -43865.55 -15721.51 94866.09
## [3,] -686684.4 227.3880 -45736.02 -18430.16 95360.53
## [4,] -700283.8 215.6090 -38317.33 -13613.21 99407.87
## [5,] -697426.7 225.0321 -38982.54 -19958.66 96946.95
## [6,] -691950.1 223.7428 -40809.85 -17063.33 96508.44
mcmc_areas(posterior, pars = c("(Intercept)", "sqft_living", "bedrooms", "bathrooms", "grade", "condition")) +
ggtitle("Posterior Distributions of Coefficients")
# Pairwise relationships
mcmc_pairs(posterior, pars = c("(Intercept)", "sqft_living", "bedrooms", "bathrooms", "grade", "condition"))
## Warning: Only one chain in 'x'. This plot is more useful with multiple chains.
The warning appears because as.matrix() pools the four chains into a single matrix, so mcmc_pairs() cannot compare chains.
These plots show the uncertainty in each coefficient, the correlation between coefficients, and the shapes of the posterior distributions.
Helpful for seeing whether parameters are tightly estimated or very uncertain.
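For a more compact view, mcmc_intervals() draws point estimates with inner and outer credible intervals; the intercept is left out here because its scale would dwarf the other coefficients. A minimal sketch:
# Median, 50% interval (thick) and 90% interval (thin) for each coefficient
mcmc_intervals(posterior,
               pars = c("sqft_living", "bedrooms", "bathrooms", "grade", "condition")) +
  ggtitle("Credible Intervals for Coefficients")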
pp_check(bayes_model, plotfun = "dens_overlay") +
ggtitle("Posterior Predictive Check: Observed vs Simulated Data")
Simulates replicated price datasets from the posterior predictive distribution (y_rep) and overlays their densities on the observed data (y).
Checks whether the model can reproduce the patterns in the real data, such as the strong right skew in prices.
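pp_check() can also compare a single test statistic between the observed data and the simulated datasets; for example, the median price:
# Histogram of simulated medians with the observed median marked
pp_check(bayes_model, plotfun = "stat", stat = "median") +
  ggtitle("Posterior Predictive Check: Median Price")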
posterior_ci <- posterior_interval(bayes_model, prob = 0.95)
posterior_ci
## 2.5% 97.5%
## (Intercept) -730846.985 -663562.649
## sqft_living 215.889 229.366
## bedrooms -46381.655 -37619.115
## bathrooms -21798.137 -8880.444
## grade 92941.810 101510.111
## condition 54859.658 64802.256
## sigma 242436.539 246931.206
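These are central 95% credible intervals (the 2.5% and 97.5% posterior quantiles): for example, each additional square foot of living area adds roughly $216–$229 to the expected price. Other levels only require changing prob:
# 90% credible intervals (5% and 95% quantiles) instead of 95%
posterior_interval(bayes_model, prob = 0.90)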
new_data <- tibble(
sqft_living = c(2000, 2500),
bedrooms = c(3, 4),
bathrooms = c(2, 3),
grade = c(7, 8),
condition = c(3, 4)
)
pred <- posterior_predict(bayes_model, newdata = new_data)
pred_mean <- apply(pred, 2, mean)
pred_ci <- apply(pred, 2, quantile, probs = c(0.025, 0.975))
tibble(new_data, PredictedPrice = pred_mean, Lower95 = pred_ci[1,], Upper95 = pred_ci[2,])
## # A tibble: 2 × 8
## sqft_living bedrooms bathrooms grade condition PredictedPrice Lower95 Upper95
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2000 3 2 7 3 445065. -46831. 938316.
## 2 2500 4 3 8 4 664465. 193200. 1140481.
Generates predicted house prices for the two new inputs.
Computes the posterior mean prediction and a 95% credible interval for each house.
posterior_predict() draws from the posterior predictive distribution → it combines parameter uncertainty with the residual noise, so the intervals describe a single new sale rather than an average.
Interpretation
For the first house (2000 sq ft, 3 bedrooms, 2 bathrooms, grade 7, condition 3): predicted price ≈ $445,000, with a 95% credible interval of roughly −$47,000 to $938,000.
For the second house (2500 sq ft, 4 bedrooms, 3 bathrooms, grade 8, condition 4): predicted price ≈ $664,000, with a 95% credible interval of roughly $193,000 to $1,140,000.
The first interval even dips below zero because the residual standard deviation (sigma ≈ $245,000) is large relative to typical prices.
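Those wide intervals come from predicting an individual sale. If the quantity of interest is instead the expected price at those inputs, posterior_epred() reflects only parameter uncertainty and gives much narrower intervals; a sketch following the same pattern as above:
# Uncertainty about the average price for these inputs (no residual noise added)
epred <- posterior_epred(bayes_model, newdata = new_data)
tibble(new_data,
       ExpectedPrice = apply(epred, 2, mean),
       Lower95 = apply(epred, 2, quantile, probs = 0.025),
       Upper95 = apply(epred, 2, quantile, probs = 0.975))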
Key point: Unlike classical regression, which gives a single point estimate, Bayesian regression gives a full range of likely values, capturing uncertainty about the prediction.
MCMC diagnostics
posterior <- as.matrix(bayes_model) # matrix of posterior samples (all chains pooled)
library(bayesplot) # already attached above; loading again is harmless
# Trace plots: the sampled values over iterations for each parameter
mcmc_trace(posterior[, c("(Intercept)", "sqft_living", "bedrooms", "bathrooms", "grade", "condition")])
# Autocorrelation plots: how quickly successive draws become independent
mcmc_acf(posterior[, c("(Intercept)", "sqft_living", "bedrooms", "bathrooms", "grade", "condition")])
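A caveat on the matrix form: as.matrix() pools the chains, so trace plots cannot reveal disagreement between chains. as.array() keeps the chain dimension, and since coda is loaded, numerical diagnostics can also be computed from an mcmc.list; a sketch assuming the model fitted above:
# Keep the chain dimension so trace plots show all four chains separately
posterior_array <- as.array(bayes_model)
mcmc_trace(posterior_array, pars = c("sqft_living", "grade"))

# coda diagnostics: Gelman-Rubin Rhat and effective sample size per parameter
posterior_coda <- As.mcmc.list(bayes_model)
gelman.diag(posterior_coda, multivariate = FALSE)
effectiveSize(posterior_coda)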