--- title: "Examples and Use Cases" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Examples and Use Cases} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 8, fig.height = 4.5, fig.align = 'center', out.width = '95%', dpi = 100, message = FALSE, warning = FALSE ) ``` ```{r setup} set.seed(123) library(TidyDensity) library(dplyr) library(ggplot2) library(patchwork) ``` Real-world examples demonstrating TidyDensity in action across various domains and applications. ## Data Science and Analytics ### Example 1: Exploring Customer Purchase Amounts ```{r customer-purchases, fig.alt = "Density plot comparing the empirical distribution of simulated customer purchase data with a fitted gamma distribution, showing how TidyDensity can be used to identify the best-fitting distribution for business data"} # Simulate customer purchase data purchases <- c(rgamma(800, shape = 2, rate = 0.05), # Regular customers rexp(200, rate = 0.01)) # Big spenders # Fit multiple distributions normal_fit <- util_normal_param_estimate(purchases, .auto_gen_empirical = TRUE) gamma_fit <- util_gamma_param_estimate(purchases, .auto_gen_empirical = TRUE) lognormal_fit <- util_lognormal_param_estimate(purchases, .auto_gen_empirical = TRUE) # Compare using AIC aic_comparison <- data.frame( Distribution = c("Normal", "Gamma", "Log-Normal"), AIC = c( util_normal_aic(.x = purchases), util_gamma_aic(.x = purchases), util_lognormal_aic(.x = purchases) ) ) # Best distribution best_dist <- aic_comparison[which.min(aic_comparison$AIC), ] print(paste("Best fitting distribution:", best_dist$Distribution)) # Visualize best fit gamma_fit$combined_data_tbl |> tidy_combined_autoplot() + labs(title = "Customer Purchase Distribution", subtitle = paste("Best fit:", best_dist$Distribution), x = "Purchase Amount ($)", y = "Density") ``` ### Example 2: A/B Test Analysis with Bootstrap ```{r ab-test, fig.alt = "Density plot showing the bootstrap distribution of the difference in means between control and treatment groups in an A/B test, with vertical dashed lines indicating the 95% confidence interval bounds and a solid line at zero for reference"} # Simulate A/B test data group_a <- rnorm(100, mean = 50, sd = 10) # Control group_b <- rnorm(100, mean = 55, sd = 10) # Treatment # Bootstrap confidence intervals bootstrap_diff <- function(n_sims = 5000) { diffs <- numeric(n_sims) for (i in 1:n_sims) { boot_a <- sample(group_a, length(group_a), replace = TRUE) boot_b <- sample(group_b, length(group_b), replace = TRUE) diffs[i] <- mean(boot_b) - mean(boot_a) } return(diffs) } # Run bootstrap diff_distribution <- bootstrap_diff(5000) # Calculate CI ci <- quantile(diff_distribution, c(0.025, 0.975)) mean_diff <- mean(diff_distribution) # Visualize boot_tbl <- tidy_empirical(.x = diff_distribution, .num_sims = 1) tidy_autoplot(boot_tbl, .plot_type = "density") + geom_vline(xintercept = ci[1], linetype = "dashed", color = "red") + geom_vline(xintercept = ci[2], linetype = "dashed", color = "red") + geom_vline(xintercept = 0, linetype = "solid", color = "black") + labs(title = "A/B Test: Difference in Means", subtitle = sprintf("95%% CI: [%.2f, %.2f]", ci[1], ci[2]), x = "Difference (B - A)", y = "Density") # Decision if (ci[1] > 0) { cat("Result: Treatment B is significantly better (p < 0.05)\n") } else if (ci[2] < 0) { cat("Result: Control A is significantly better (p < 0.05)\n") } else { cat("Result: No significant difference detected\n") } ``` ### Example 3: Detecting Data Distribution Changes ```{r distribution-change, fig.alt = "Overlaid density plots comparing 'Before' and 'After' distributions to detect changes in data characteristics such as mean shift and variance changes over time"} # Before and after data before <- rnorm(500, mean = 100, sd = 15) after <- rnorm(500, mean = 105, sd = 20) # Mean shift and more variance # Generate distributions dist_before <- tidy_empirical(.x = before, .num_sims = 1) dist_after <- tidy_empirical(.x = after, .num_sims = 1) # Combine for visualization combined <- bind_rows( dist_before |> mutate(period = "Before"), dist_after |> mutate(period = "After") ) # Plot ggplot(combined, aes(x = y, fill = period)) + geom_density(alpha = 0.5) + labs(title = "Distribution Change Analysis", x = "Value", y = "Density", fill = "Period") + theme_minimal() # Statistical comparison cat("Before: Mean =", round(mean(before), 2), ", SD =", round(sd(before), 2), "\n") cat("After: Mean =", round(mean(after), 2), ", SD =", round(sd(after), 2), "\n") # Test for difference t_test_result <- t.test(after, before) cat("T-test p-value:", t_test_result$p.value, "\n") ``` ## Statistical Analysis ### Example 4: Power Analysis for Sample Size Determination ```{r power-analysis} # Simulate power analysis effect_sizes <- seq(0.2, 1.0, by = 0.1) sample_sizes <- seq(30, 200, by = 10) power_results <- expand.grid(effect_size = effect_sizes, sample_size = sample_sizes) # Calculate power for each combination calculate_power <- function(n, effect) { simulations <- 1000 significant <- 0 for (i in 1:simulations) { group1 <- rnorm(n, mean = 0, sd = 1) group2 <- rnorm(n, mean = effect, sd = 1) p_value <- t.test(group1, group2)$p.value if (p_value < 0.05) significant <- significant + 1 } return(significant / simulations) } # Run for a few key combinations (subset for speed) key_combos <- power_results[c(1, 50, 100, 150), ] |> na.omit() key_combos$power <- mapply(calculate_power, key_combos$sample_size, key_combos$effect_size) print(key_combos) ``` ### Example 5: Distribution Goodness-of-Fit Testing ```{r goodness-of-fit} # Generate data (actually from gamma, but test multiple) test_data <- rgamma(200, shape = 3, rate = 0.5) # Fit multiple distributions and test distributions <- list( "Normal" = function(x) util_normal_param_estimate(x, .auto_gen_empirical = FALSE), "Gamma" = function(x) util_gamma_param_estimate(x, .auto_gen_empirical = FALSE), "Weibull" = function(x) util_weibull_param_estimate(x, .auto_gen_empirical = FALSE), "Log-Normal" = function(x) util_lognormal_param_estimate(x, .auto_gen_empirical = FALSE) ) # Calculate AIC for each aic_results <- c( "Normal" = util_normal_aic(.x = test_data), "Gamma" = util_gamma_aic(.x = test_data), "Weibull" = util_weibull_aic(.x = test_data), "Log-Normal" = util_lognormal_aic(.x = test_data) ) # Results results_df <- data.frame( Distribution = names(aic_results), AIC = aic_results, Delta_AIC = aic_results - min(aic_results) ) |> arrange(AIC) print(results_df) cat("\nBest fitting distribution:", results_df$Distribution[1], "\n") ``` ## Finance and Risk Management ### Example 6: Value at Risk (VaR) Calculation ```{r var-calculation, fig.alt = "Density plot of simulated daily portfolio returns showing the distribution with vertical dashed lines marking the 95% and 99% Value at Risk thresholds, used for risk management analysis"} # Simulate daily returns returns <- rnorm(1000, mean = 0.001, sd = 0.02) # 0.1% daily return, 2% volatility # Create distribution returns_dist <- tidy_empirical(.x = returns, .num_sims = 1) # Calculate VaR at different confidence levels var_95 <- quantile(returns, 0.05) var_99 <- quantile(returns, 0.01) var_995 <- quantile(returns, 0.005) # Bootstrap for VaR confidence intervals boot_returns <- tidy_bootstrap(.x = returns, .num_sims = 2000) boot_var <- boot_returns |> bootstrap_unnest_tbl() |> group_by(sim_number) |> summarise(var_95 = quantile(y, 0.05)) |> ungroup() var_ci <- quantile(boot_var$var_95, c(0.025, 0.975)) # Results cat("Value at Risk (VaR) Analysis\n") cat("95% VaR:", round(var_95 * 100, 3), "%\n") cat("99% VaR:", round(var_99 * 100, 3), "%\n") cat("99.5% VaR:", round(var_995 * 100, 3), "%\n") cat("\n95% CI for 95% VaR: [", round(var_ci[1] * 100, 3), "%,", round(var_ci[2] * 100, 3), "%]\n") # Visualize tidy_autoplot(returns_dist, .plot_type = "density") + geom_vline(xintercept = var_95, color = "orange", linetype = "dashed") + geom_vline(xintercept = var_99, color = "red", linetype = "dashed") + annotate("text", x = var_95, y = 0, label = "95% VaR", vjust = -1) + annotate("text", x = var_99, y = 0, label = "99% VaR", vjust = -1) + labs(title = "Portfolio Returns Distribution with VaR", x = "Daily Return", y = "Density") ``` ### Example 7: Option Pricing with Monte Carlo Simulation ```{r option-pricing, fig.alt = "Density plot showing the simulated distribution of stock prices at option maturity using geometric Brownian motion, with vertical lines indicating the current stock price and strike price for option pricing analysis"} # Simulate stock price paths using GBM stock_price_simulation <- function(S0, mu, sigma, T, n_steps, n_sims) { dt <- T / n_steps paths <- matrix(0, nrow = n_steps + 1, ncol = n_sims) paths[1, ] <- S0 for (i in 2:(n_steps + 1)) { Z <- rnorm(n_sims) paths[i, ] <- paths[i-1, ] * exp((mu - 0.5 * sigma^2) * dt + sigma * sqrt(dt) * Z) } return(paths) } # Parameters S0 <- 100 # Initial stock price K <- 105 # Strike price mu <- 0.05 # Drift sigma <- 0.2 # Volatility T <- 1 # Time to maturity (1 year) n_steps <- 252 # Daily steps n_sims <- 1000 # Simulate paths <- stock_price_simulation(S0, mu, sigma, T, n_steps, n_sims) # Final prices final_prices <- paths[nrow(paths), ] # Option payoffs call_payoffs <- pmax(final_prices - K, 0) put_payoffs <- pmax(K - final_prices, 0) # Expected values (discounted) r <- 0.03 # Risk-free rate call_price <- mean(call_payoffs) * exp(-r * T) put_price <- mean(put_payoffs) * exp(-r * T) cat("Call Option Price:", round(call_price, 2), "\n") cat("Put Option Price:", round(put_price, 2), "\n") # Visualize final price distribution final_dist <- tidy_empirical(.x = final_prices, .num_sims = 1) tidy_autoplot(final_dist, .plot_type = "density") + geom_vline(xintercept = K, color = "red", linetype = "dashed") + geom_vline(xintercept = S0, color = "blue", linetype = "dashed") + labs(title = "Simulated Stock Price Distribution at Maturity", x = "Stock Price", y = "Density") ``` ## Quality Control and Manufacturing ### Example 8: Process Capability Analysis ```{r process-capability, fig.alt = "Density plot showing the distribution of manufacturing measurements overlaid with specification limits (LSL and USL) and target value, with process capability indices Cp and Cpk displayed in the subtitle"} # Manufacturing process data measurements <- rnorm(500, mean = 100, sd = 2) # Target = 100 LSL <- 94 # Lower specification limit USL <- 106 # Upper specification limit # Fit distribution fit <- util_normal_param_estimate(measurements, .auto_gen_empirical = TRUE) mean_est <- fit$parameter_tbl$mu[1] sd_est <- fit$parameter_tbl$stan_dev[1] # Calculate Cp and Cpk Cp <- (USL - LSL) / (6 * sd_est) Cpu <- (USL - mean_est) / (3 * sd_est) Cpl <- (mean_est - LSL) / (3 * sd_est) Cpk <- min(Cpu, Cpl) # Defect rate defect_rate <- (pnorm(LSL, mean_est, sd_est) + (1 - pnorm(USL, mean_est, sd_est))) * 100 # Results cat("Process Capability Analysis\n") cat("Cp:", round(Cp, 3), "\n") cat("Cpk:", round(Cpk, 3), "\n") cat("Estimated Defect Rate:", round(defect_rate, 4), "%\n") # Interpretation if (Cpk >= 1.33) { cat("Process capability: GOOD\n") } else if (Cpk >= 1.0) { cat("Process capability: ADEQUATE\n") } else { cat("Process capability: POOR - Action required\n") } # Visualize fit$combined_data_tbl |> tidy_combined_autoplot() + geom_vline(xintercept = LSL, color = "red", linetype = "dashed") + geom_vline(xintercept = USL, color = "red", linetype = "dashed") + geom_vline(xintercept = 100, color = "blue", linetype = "solid") + labs(title = "Process Capability Analysis", subtitle = sprintf("Cp = %.3f, Cpk = %.3f", Cp, Cpk), x = "Measurement", y = "Density") ``` ### Example 9: Control Chart Analysis ```{r control-chart, fig.alt = "X-bar control chart showing process measurements over time with horizontal lines for the mean and upper/lower control limits (UCL/LCL), and a vertical line indicating where a process shift occurred"} # Generate process data with a shift n_before <- 200 n_after <- 50 before_shift <- rnorm(n_before, mean = 100, sd = 3) after_shift <- rnorm(n_after, mean = 105, sd = 3) # Process shifted all_data <- c(before_shift, after_shift) time <- 1:length(all_data) # Calculate control limits mean_val <- mean(before_shift) sd_val <- sd(before_shift) UCL <- mean_val + 3 * sd_val LCL <- mean_val - 3 * sd_val # Create control chart control_data <- data.frame(time = time, value = all_data) ggplot(control_data, aes(x = time, y = value)) + geom_line() + geom_point() + geom_hline(yintercept = mean_val, color = "blue") + geom_hline(yintercept = UCL, color = "red", linetype = "dashed") + geom_hline(yintercept = LCL, color = "red", linetype = "dashed") + geom_vline(xintercept = n_before, color = "orange", linetype = "dotted") + labs(title = "X-bar Control Chart", subtitle = "Process Shift Detected", x = "Sample", y = "Measurement") + theme_minimal() ``` ## Healthcare and Epidemiology ### Example 10: Disease Outbreak Modeling ```{r disease-outbreak, fig.alt = "Two-panel display showing (1) a time series of daily disease cases with observed counts as points and the expected rate as a red dashed line, and (2) a density plot of the case distribution during the peak outbreak period fitted with a Poisson distribution"} # Model daily case counts with Poisson distribution days <- 1:90 baseline_rate <- 5 peak_rate <- 50 # Simulate outbreak with time-varying rate lambda <- baseline_rate + (peak_rate - baseline_rate) * exp(-((days - 45)^2) / (2 * 15^2)) # Gaussian curve cases <- rpois(length(days), lambda) # Fit Poisson distribution to peak period peak_cases <- cases[35:55] poisson_fit <- util_poisson_param_estimate(peak_cases, .auto_gen_empirical = TRUE) # Visualize time series outbreak_data <- data.frame(day = days, cases = cases, expected = lambda) p1 <- ggplot(outbreak_data, aes(x = day, y = cases)) + geom_point() + geom_line(aes(y = expected), color = "red", linetype = "dashed") + labs(title = "Disease Outbreak Time Series", subtitle = "Observed Cases vs Expected Rate", x = "Day", y = "Number of Cases") + theme_minimal() # Peak analysis p2 <- poisson_fit$combined_data_tbl |> tidy_combined_autoplot() + labs(title = "Peak Period Case Distribution", x = "Daily Cases", y = "Density") p1 / p2 ``` ## Machine Learning and Simulation ### Example 11: Monte Carlo Simulation for Project Planning ```{r project-planning, fig.alt = "Density plot showing the Monte Carlo simulated distribution of total project duration with vertical lines marking the 10th percentile (optimistic), 50th percentile (median), and 90th percentile (pessimistic) estimates"} # Project task duration simulation using triangular distribution tasks <- list( task1 = list(min = 5, mode = 8, max = 15), task2 = list(min = 3, mode = 5, max = 10), task3 = list(min = 7, mode = 10, max = 20), task4 = list(min = 2, mode = 4, max = 8) ) # Simulate total project duration n_sims <- 5000 total_durations <- numeric(n_sims) for (i in 1:n_sims) { task_durations <- sapply(tasks, function(task) { tidy_triangular(.n = 2, .min = task$min, .max = task$max, .mode = task$mode)$y[1] }) total_durations[i] <- sum(task_durations) } # Analyze results duration_dist <- tidy_empirical(.x = total_durations, .num_sims = 1) percentiles <- quantile(total_durations, c(0.1, 0.5, 0.9)) cat("Project Duration Analysis\n") cat("10th percentile (optimistic):", round(percentiles[1], 1), "days\n") cat("50th percentile (median):", round(percentiles[2], 1), "days\n") cat("90th percentile (pessimistic):", round(percentiles[3], 1), "days\n") # Visualize tidy_autoplot(duration_dist, .plot_type = "density") + geom_vline(xintercept = percentiles[1], color = "green", linetype = "dashed") + geom_vline(xintercept = percentiles[2], color = "blue", linetype = "solid") + geom_vline(xintercept = percentiles[3], color = "red", linetype = "dashed") + labs(title = "Project Duration Distribution", subtitle = "Monte Carlo Simulation (5000 iterations)", x = "Total Duration (days)", y = "Density") ``` ## Education and Teaching ### Example 12: Demonstrating Central Limit Theorem ```{r central-limit-theorem, fig.alt = "Four-panel density plot showing how the distribution of sample means from a uniform distribution becomes more normal as sample size increases from n=5 to n=50, demonstrating the Central Limit Theorem"} # Show CLT with different source distributions n_samples <- 1000 sample_sizes <- c(5, 10, 30, 50) demonstrate_clt <- function(source_dist_func, sample_size, n_samples) { sample_means <- numeric(n_samples) for (i in 1:n_samples) { sample <- source_dist_func(sample_size) sample_means[i] <- mean(sample) } return(sample_means) } # Source: Uniform distribution uniform_source <- function(n) runif(n, 0, 1) # Generate for different sample sizes results <- lapply(sample_sizes, function(n) { means <- demonstrate_clt(uniform_source, n, n_samples) tidy_empirical(.x = means, .num_sims = 1) |> mutate(sample_size = n) }) # Combine and plot combined <- bind_rows(results) ggplot(combined, aes(x = y)) + geom_density(fill = "skyblue", alpha = 0.5) + facet_wrap(~sample_size, scales = "free", labeller = labeller(sample_size = function(x) paste("n =", x))) + labs(title = "Central Limit Theorem Demonstration", subtitle = "Sample Means from Uniform(0,1) Distribution", x = "Sample Mean", y = "Density") + theme_minimal() ``` ## Tips for Your Own Use Cases ### 1. Start with Exploration ```{r tip-exploration, eval = FALSE} # Explore your data summary(your_data) hist(your_data) # Try empirical distribution emp <- tidy_empirical(.x = your_data, .num_sims = 1) tidy_autoplot(emp, .plot_type = "density") ``` ### 2. Try Multiple Distributions ```{r tip-multiple-dist, eval = FALSE} # Systematic comparison distributions <- c("normal", "gamma", "lognormal", "weibull") aics <- sapply(distributions, function(d) { func <- get(paste0("util_", d, "_aic")) func(.x = your_data) }) best <- distributions[which.min(aics)] ``` ### 3. Validate with Visualization ```{r tip-visualization, eval = FALSE} # Always visualize fit <- get(paste0("util_", best, "_param_estimate"))( your_data, .auto_gen_empirical = TRUE ) fit$combined_data_tbl |> tidy_combined_autoplot() ``` ### 4. Use Bootstrap for Uncertainty ```{r tip-bootstrap, eval = FALSE} # Add bootstrap confidence intervals boot <- tidy_bootstrap(.x = your_data, .num_sims = 2000) boot |> bootstrap_unnest_tbl() |> summarise( lower = quantile(y, 0.025), upper = quantile(y, 0.975) ) ``` ## Key Takeaways ### 1. Distribution Fitting is Essential Use AIC to compare multiple candidate distributions and select the best fit for your data. ### 2. Bootstrap Provides Robust Uncertainty When parametric assumptions are uncertain, bootstrap resampling gives reliable confidence intervals. ### 3. Visualization Validates Analysis Always plot your results to verify that fitted distributions match the empirical data. ### 4. TidyDensity Integrates with Tidyverse Leverage dplyr, ggplot2, and other tidyverse tools for seamless analysis workflows. ### 5. Domain Knowledge Guides Choices Let your understanding of the problem domain inform distribution selection and interpretation.