Understanding the fundamental concepts behind TidyDensity will help you use the package effectively.
Tidy data follows three principles:
# Traditional approach (base R)
x <- rnorm(100)
# Just a vector - limited functionality
# TidyDensity approach
data <- tidy_normal(.n = 100)
# A tibble with structure:
# - sim_number: simulation ID
# - x: observation number
# - y: random value
# - dx, dy: density values
# - p: cumulative probability
# - q: quantile
head(data)
#> # A tibble: 6 × 7
#> sim_number x y dx dy p q
#> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 0.235 -3.07 0.000196 0.593 0.235
#> 2 1 2 0.477 -3.00 0.000375 0.683 0.477
#> 3 1 3 -0.174 -2.93 0.000693 0.431 -0.174
#> 4 1 4 -0.0222 -2.86 0.00123 0.491 -0.0222
#> 5 1 5 0.268 -2.79 0.00211 0.606 0.268
#> 6 1 6 1.28 -2.72 0.00348 0.900 1.28A probability distribution describes how values of a random variable are distributed.
Values can take any real number within a range:
# Normal distribution
normal_data <- tidy_normal(.n = 100, .mean = 0, .sd = 1)
# Uniform distribution
uniform_data <- tidy_uniform(.n = 100, .min = 0, .max = 1)
# Visualize both
p1 <- tidy_autoplot(normal_data, .plot_type = "density") +
ggtitle("Normal Distribution")
p2 <- tidy_autoplot(uniform_data, .plot_type = "density") +
ggtitle("Uniform Distribution")
p1 | p2Values can only take specific integers:
# Poisson distribution
poisson_data <- tidy_poisson(.n = 100, .lambda = 5)
# Binomial distribution
binomial_data <- tidy_binomial(.n = 100, .size = 10, .prob = 0.5)
# Visualize both
p1 <- tidy_autoplot(poisson_data, .plot_type = "density") +
ggtitle("Poisson Distribution")
p2 <- tidy_autoplot(binomial_data, .plot_type = "density") +
ggtitle("Binomial Distribution")
p1 | p2Location (Center): - Where the distribution is centered - Examples: mean, median, mode
Scale (Spread): - How spread out the values are - Examples: standard deviation, variance, IQR
Shape: - Form of the distribution - Examples: skewness, kurtosis, modality
# Normal: Symmetric, bell-shaped
normal <- tidy_normal(.n = 100, .mean = 0, .sd = 1)
# Gamma: Right-skewed
gamma <- tidy_gamma(.n = 100, .shape = 2, .scale = 1)
# Uniform: Flat, all values equally likely
uniform <- tidy_uniform(.n = 100, .min = 0, .max = 1)
# Visualize characteristics
p1 <- tidy_autoplot(normal, .plot_type = "density") +
ggtitle("Normal: Symmetric")
p2 <- tidy_autoplot(gamma, .plot_type = "density") +
ggtitle("Gamma: Right-skewed")
p3 <- tidy_autoplot(uniform, .plot_type = "density") +
ggtitle("Uniform: Flat")
p1 | p2 | p3Every probability distribution has four related functions:
Probability Density Function (PDF) for continuous
distributions: - How likely is a specific value? - In
TidyDensity: dy column
Cumulative Distribution Function (CDF): - What’s the
probability of getting a value ≤ x? - In TidyDensity: p
column
data <- tidy_normal(.n = 100, .mean = 0, .sd = 1)
# p column contains cumulative probabilities
# p = 0.5 means 50% of values are below this point
head(data[, c("y", "p")])
#> # A tibble: 6 × 2
#> y p
#> <dbl> <dbl>
#> 1 1.63 0.948
#> 2 0.494 0.689
#> 3 -0.648 0.259
#> 4 0.346 0.635
#> 5 0.590 0.722
#> 6 -1.58 0.0576Inverse of CDF (Quantile Function): - What value
corresponds to a given probability? - In TidyDensity: q
column
Generate random values: - Simulate data from the
distribution - In TidyDensity: y column
data <- tidy_normal(.n = 100, .num_sims = 1)
# Density plot (d function)
p1 <- tidy_autoplot(data, .plot_type = "density") +
ggtitle("Density (d)")
# CDF plot (p function)
p2 <- tidy_autoplot(data, .plot_type = "probability") +
ggtitle("Probability (p)")
# Quantile plot (q function)
p3 <- tidy_autoplot(data, .plot_type = "quantile") +
ggtitle("Quantile (q)")
# Combined view
(p1 | p2) / p3Computer-generated “random” numbers are actually pseudorandom: - Deterministic algorithm - Appears random but reproducible with same seed - Good enough for most applications
Why use multiple simulations?
# Single simulation - might not represent true distribution
single <- tidy_normal(.n = 100, .num_sims = 1)
# Multiple simulations - better understanding of variability
multiple <- tidy_normal(.n = 100, .num_sims = 20)
p1 <- tidy_autoplot(single, .plot_type = "density") +
ggtitle("Single Simulation")
p2 <- tidy_autoplot(multiple, .plot_type = "density") +
ggtitle("20 Simulations")
p1 | p2Use cases: - Assess sampling variability - Monte Carlo simulation - Sensitivity analysis - Uncertainty quantification
Goal: Estimate distribution parameters from observed data
# Observed data
observed <- c(10.2, 9.8, 10.5, 10.1, 9.9)
# Estimate parameters
fit <- util_normal_param_estimate(observed)
# Get estimates
fit$parameter_tbl
#> # A tibble: 2 × 8
#> dist_type samp_size min max method mu stan_dev shape_ratio
#> <chr> <int> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
#> 1 Gaussian 5 9.8 10.5 EnvStats_MME_MLE 10.1 0.245 41.2
#> 2 Gaussian 5 9.8 10.5 EnvStats_MVUE 10.1 0.274 36.9Concept: Find parameters that maximize probability of observing the data
Characteristics: - Asymptotically efficient - Best for large samples (n > 30) - Most commonly used
Concept: Match sample moments to theoretical moments
Characteristics: - Simpler computation - Often same as MLE for common distributions - Intuitive approach
Concept: Unbiased estimates with minimum variance
Characteristics: - Best for small samples - Corrects for small-sample bias - Theoretically optimal when available
Akaike Information Criterion (AIC): - Balances fit quality with model complexity - Lower AIC = better model - Used to compare distributions
# Generate some data with local seed
data_y <- withr::with_seed(42, rnorm(100, mean = 5, sd = 2))
# Compare multiple distributions
normal_aic <- util_normal_aic(.x = data_y)
cauchy_aic <- util_cauchy_aic(.x = data_y)
logistic_aic <- util_logistic_aic(.x = data_y)
# Show AIC values
cat("Normal AIC:", normal_aic, "\n")
#> Normal AIC: 433.517
cat("Cauchy AIC:", cauchy_aic, "\n")
#> Cauchy AIC: 466.3125
cat("Logistic AIC:", logistic_aic, "\n")
#> Logistic AIC: 433.9747
# Choose distribution with lowest AIC
best_model <- c("Normal", "Cauchy", "Logistic")[which.min(c(normal_aic, cauchy_aic, logistic_aic))]
cat("Best model:", best_model, "\n")
#> Best model: NormalUsing distributions for hypothesis tests:
# Test if sample mean differs from 0
observed_data <- withr::with_seed(456, rnorm(100, mean = 0.5, sd = 1))
# Generate null distribution with local seed
null_dist <- withr::with_seed(789, tidy_normal(.n = 100, .mean = 0, .sd = 1, .num_sims = 1000))
# Calculate test statistic
observed_mean <- mean(observed_data)
# Calculate null means for each simulation
null_means <- null_dist |>
group_by(sim_number) |>
summarise(sim_mean = mean(y), .groups = "drop")
# P-value: proportion of null means more extreme than observed
p_value <- mean(abs(null_means$sim_mean) >= abs(observed_mean))
cat("The mean of observed data is:", observed_mean, "\n")
#> The mean of observed data is: 0.6205748
cat("The p-value is:", p_value, "\n")
#> The p-value is: 0Bootstrap confidence intervals:
# Bootstrap resampling
boot_data <- tidy_bootstrap(.x = observed_data, .num_sims = 2000)
# Calculate 95% CI
ci <- boot_data |>
bootstrap_unnest_tbl() |>
summarise(
lower = quantile(y, 0.025),
upper = quantile(y, 0.975)
)
cat("95% Confidence Interval:", ci$lower, "to", ci$upper, "\n")
#> 95% Confidence Interval: -1.22054 to 2.520635Determining required sample size:
# Simulate to estimate power
simulate_test <- function(n, effect_size, alpha = 0.05) {
group1 <- rnorm(n, mean = 0, sd = 1)
group2 <- rnorm(n, mean = effect_size, sd = 1)
t.test(group1, group2)$p.value < alpha
}
# Run many simulations
n_sims <- 1000
power <- mean(replicate(n_sims, simulate_test(n = 50, effect_size = 0.5)))
cat("Power:", power, "\n")
#> Power: 0.698tidy_normal(.n = 100, .num_sims = 5) |>
group_by(sim_number) |>
summarise(
mean = mean(y),
sd = sd(y),
median = median(y)
) |>
arrange(desc(mean))
#> # A tibble: 5 × 4
#> sim_number mean sd median
#> <fct> <dbl> <dbl> <dbl>
#> 1 5 0.0698 0.936 0.0550
#> 2 4 0.00723 0.965 0.0282
#> 3 2 -0.0125 0.888 -0.104
#> 4 1 -0.0960 1.00 0.00538
#> 5 3 -0.123 0.956 -0.0866data <- tidy_normal(.n = 100, .num_sims = 3)
# Custom ggplot
ggplot(data, aes(x = y, color = sim_number)) +
geom_density() +
theme_minimal() +
labs(
title = "Custom ggplot2 Density Plot",
x = "Value",
y = "Density",
color = "Simulation"
)library(tidyr)
data <- tidy_normal(.n = 100, .num_sims = 3)
# Widen data
wide_data <- data |>
select(sim_number, x, y) |>
pivot_wider(names_from = sim_number, values_from = y, names_prefix = "sim_")
head(wide_data)
#> # A tibble: 6 × 4
#> x sim_1 sim_2 sim_3
#> <int> <dbl> <dbl> <dbl>
#> 1 1 -1.73 0.155 -1.09
#> 2 2 0.790 0.470 -0.550
#> 3 3 -2.01 0.0698 -3.10
#> 4 4 1.57 2.13 0.859
#> 5 5 0.722 0.105 -0.135
#> 6 6 -1.34 -1.88 -1.06library(purrr)
# Generate multiple distributions
distributions <- list(
normal = tidy_normal(.n = 100),
gamma = tidy_gamma(.n = 100, .shape = 2, .scale = 1),
beta = tidy_beta(.n = 100, .shape1 = 2, .shape2 = 5)
)
# Map over distributions
distributions |>
map(~ summarise(., mean = mean(y), sd = sd(y)))
#> $normal
#> # A tibble: 1 × 2
#> mean sd
#> <dbl> <dbl>
#> 1 -0.0108 1.02
#>
#> $gamma
#> # A tibble: 1 × 2
#> mean sd
#> <dbl> <dbl>
#> 1 1.98 1.20
#>
#> $beta
#> # A tibble: 1 × 2
#> mean sd
#> <dbl> <dbl>
#> 1 0.299 0.168Every TidyDensity function returns a structured tibble that works with tidyverse tools.
Understanding these four functions is key to working with distributions.
Use MLE for large samples, MVUE for small samples, compare with AIC.
Use withr::with_seed() for reproducible random number
generation with explicit scope.
Always plot your data and fitted distributions to validate assumptions.