Skip to content

Commit

Permalink
Document main methods and improve underlying algorithm computation
Browse files Browse the repository at this point in the history
  • Loading branch information
coatless committed Sep 4, 2024
1 parent 5c7e208 commit 8547b0a
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 48 deletions.
111 changes: 74 additions & 37 deletions R/surreal.R
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
#' Transform Data for X and Y Finding Algorithm by Adding a Border
#' Transform Data by Adding a Border
#'
#' This function transforms the input data by adding points around the original data
#' to create a frame. It uses an optimization process to find the best alpha parameter
#' for point distribution.
#' for point distribution, which helps in making the fitted values and residuals orthogonal.
#'
#' @param x Numeric vector of x coordinates
#' @param y Numeric vector of y coordinates
#' @param n_add_points Integer. Number of points to add on each side of the frame. Default is 40.
#' @param verbose Logical. If TRUE, prints optimization progress. Default is FALSE.
#' @param x Numeric vector of x coordinates.
#' @param y Numeric vector of y coordinates.
#' @param n_add_points Integer. Number of points to add on each side of the frame. Default is `40`.
#' @param verbose Logical. If `TRUE`, prints optimization progress. Default is `FALSE`.
#'
#' @return
#' A matrix with two columns representing the transformed x and y coordinates.
#' A matrix with two columns representing the transformed `x` and `y` coordinates.
#'
#' @export
#' @examples
Expand All @@ -24,11 +24,11 @@
#'
#' @importFrom stats optimize lm coef
border_augmentation <- function(x, y, n_add_points = 40, verbose = FALSE) {
# Constants
# Define constants for frame size and shift
FRAME <- 0.05
SHIFT <- 1 + 2 * FRAME

# Calculate ranges and deltas
# Helper function to calculate range and delta for x and y
calculate_range <- function(values) {
range <- range(values)
delta <- diff(range)
Expand All @@ -38,10 +38,13 @@ border_augmentation <- function(x, y, n_add_points = 40, verbose = FALSE) {
)
}

# Calculate ranges and deltas for x and y
x_data <- calculate_range(x)
y_data <- calculate_range(y)

# Helper function to generate points based on alpha
generate_points <- function(alpha, range, delta) {
# Create sequence of points based on alpha value
pkt <- if (alpha <= 1) seq(0, 1, length.out = n_add_points)^alpha
else 1 - seq(0, 1, length.out = n_add_points)^(2 - alpha)
list(
Expand All @@ -50,6 +53,7 @@ border_augmentation <- function(x, y, n_add_points = 40, verbose = FALSE) {
)
}

# Helper function to combine original and generated points
combine_points <- function(alpha) {
x_points <- generate_points(alpha, x_data$range, x_data$delta)
y_points <- generate_points(alpha, y_data$range, y_data$delta)
Expand All @@ -62,16 +66,17 @@ border_augmentation <- function(x, y, n_add_points = 40, verbose = FALSE) {
list(xx = xx, yy = yy)
}

# Optimization function to find best alpha
optimize_alpha <- function(alpha) {
points <- combine_points(alpha)
abs(stats::coef(stats::lm(points$yy ~ points$xx))[2])
}

# Find optimal alpha
# Find optimal alpha using optimization
optimal_alpha <- stats::optimize(optimize_alpha, lower = 0, upper = 2)$minimum
if (verbose) cat("Optimal alpha:", optimal_alpha, "\n")

# Generate final points
# Generate final points using optimal alpha
final_points <- combine_points(optimal_alpha)
cbind(final_points$xx, final_points$yy)
}
Expand All @@ -85,9 +90,7 @@ border_augmentation <- function(x, y, n_add_points = 40, verbose = FALSE) {
#'
#' @inheritParams surreal
#'
#' @return
#' A list with two elements:
#'
#' @return A list with two elements:
#' \describe{
#' \item{X}{The generated X matrix}
#' \item{y}{The generated y vector}
Expand All @@ -97,52 +100,60 @@ border_augmentation <- function(x, y, n_add_points = 40, verbose = FALSE) {
#' @noRd
find_X_y_core <- function(y_hat, R_0, R_squared = 0.3, p = 5, max_iter = 100, tolerance = 0.01, verbose = FALSE) {
n <- length(R_0)

# Scale y_hat to achieve desired R-squared
y_hat <- sd(R_0) / sd(y_hat) * sqrt(R_squared / (1 - R_squared)) * y_hat

beta_0 <- 0
beta_p <- seq_len(p)
j_star <- p
# Initialize parameters
beta <- c(0, seq_len(p)) # beta_0 and beta_{1:p} combined
j_star <- p + 1 # Adjusting for 1-based indexing in R

# Generate random noise
Z <- rnorm(n, sd = sd(R_0))
M <- matrix(rnorm(n * p, sd = sd(y_hat)), n, p)

P_R_0 <- R_0 %*% t(R_0) / (R_0 %*% R_0)[1]
M_jstar_old <- M[, j_star]
# Calculate projection matrix
P_R_0 <- tcrossprod(R_0) / sum(R_0^2)

# Iterative optimization
for (i in seq_len(max_iter)) {
W <- cbind(1, (diag(n) - P_R_0) %*% M)
A_M <- W %*% solve(t(W) %*% W) %*% t(W)
SUM_beta_M_all <- M %*% beta_p
FIRST <- y_hat - beta_0 - A_M %*% Z + P_R_0 %*% M %*% beta_p - SUM_beta_M_all
M[, j_star] <- 1 / beta_p[j_star] * (FIRST + beta_p[j_star] * M[, j_star])
A_M <- W %*% solve(crossprod(W), t(W))

SUM_beta_M_all <- M %*% beta[-1] # Exclude beta_0
FIRST <- y_hat - beta[1] - A_M %*% Z + P_R_0 %*% M %*% beta[-1] - SUM_beta_M_all

h <- M[, j_star]
h_delta_sum <- sum((h - M_jstar_old)^2)
M_new <- M
M_new[, j_star - 1] <- (FIRST + beta[j_star] * M[, j_star - 1]) / beta[j_star]

if (verbose) print(h_delta_sum)
if (h_delta_sum < tolerance) break
delta <- sum((M_new - M)^2)

M_jstar_old <- h
if (verbose) {
cat("Iteration", i, "- Delta:", delta, "\n")
}

if (delta < tolerance) break

M <- M_new
}

# Calculate final X and Y
eps <- R_0 + A_M %*% Z
X <- (diag(n) - P_R_0) %*% M
Y <- beta_0 + X %*% beta_p + eps
Y <- beta[1] + X %*% beta[-1] + eps

list(y = Y, X = X)
}


#' Find X Matrix and Y Vector for Residual Surrealism
#'
#' This function implements the Residual (Sur)Realism algorithm as described by
#' Leonard A. Stefanski (2007). It finds a matrix X and vector y such that the
#' fitted values and residuals of lm(y ~ X) are similar to the inputs y_hat and R_0.
#'
#' @param data A data frame or matrix with two columns representing the `y_hat` and `R_0` values.
#' @param y_hat Numeric vector of desired fitted values (only used if `data` is not provided)
#' @param R_0 Numeric vector of desired residuals (only used if `data` is not provided)
#' @param y_hat Numeric vector of desired fitted values (only used if `data` is not provided).
#' @param R_0 Numeric vector of desired residuals (only used if `data` is not provided).
#' @param R_squared Desired R-squared value. Default is 0.3.
#' @param p Integer. Desired number of columns for matrix X. Default is 5.
#' @param n_add_points Integer. Number of points to add in border transformation. Default is 40.
Expand All @@ -151,14 +162,15 @@ find_X_y_core <- function(y_hat, R_0, R_squared = 0.3, p = 5, max_iter = 100, to
#' @param verbose Logical. If TRUE, prints progress information. Default is FALSE.
#'
#' @return
#' A data frame with containing the generated X matrix and y vector.
#' A data frame containing the generated X matrix and y vector.
#'
#' @details
#' To disable the border augmentation, please set `n_add_points = 0`.#'
#' To disable the border augmentation, set `n_add_points = 0`.
#'
#' @importFrom stats rnorm sd
#' @importFrom graphics pairs
#' @export
#'
#' @examples
#' # Generate a 2D data set
#' data <- cbind(y = rnorm(100), x = rnorm(100))
Expand Down Expand Up @@ -189,33 +201,58 @@ surreal <- function(
n_add_points = 40,
max_iter = 100, tolerance = 0.01, verbose = FALSE) {

# Input validation
if (R_squared <= 0 || R_squared >= 1) {
stop("`R_squared` must be between 0 and 1 (supplied: ", R_squared ,")")
}
if (p < 1) {
stop("`p` must be at least 1 (supplied: ", p ,")")
}
if (n_add_points < 0) {
stop("`n_add_points` must be a non-negative integer (supplied: ", n_add_points ,")")
}
if (max_iter < 1) {
stop("`max_iter` must be at least 1 (supplied: ", max_iter ,")")
}
if (tolerance <= 0) {
stop("`tolerance` must be a positive number (supplied: ", tolerance ,")")
}

# Check if data is provided and extract y_hat and R_0
if (!missing(data) && (is.data.frame(data) | is.matrix(data)) && ncol(data) == 2) {
y_hat <- as.vector(data[, 1])
R_0 <- as.vector(data[, 2])
}

if (length(y_hat) != length(R_0)) {
stop("`y_hat` and `R_0` must have the same length. (", length(y_hat) ,"!= ", length(R_0) ,")")
}

# Plot original data if verbose
if (verbose) {
plot(y_hat, R_0, main = "Original data", xlab = '', ylab = '')
}

# Apply bordering to data
# Apply bordering to data if n_add_points > 0
if (n_add_points > 0) {
xy <- border_augmentation(y_hat, R_0, n_add_points = n_add_points, verbose = verbose)
} else {
xy <- cbind(y_hat, R_0)
}

# Extract transformed y_hat and R_0
y_hat <- xy[, 1]
R_0 <- xy[, 2] - mean(xy[, 2])

# Find X and y
# Find X and y using core algorithm
data <- find_X_y_core(
y_hat, R_0, R_squared = R_squared, p = p,
max_iter = max_iter, tolerance = tolerance, verbose = verbose)

# Re-do the data frame
# Create result data frame
result <- data.frame(y = data$y, X = data$X)

# Plot transformed data and residuals if verbose
if (verbose) {
pairs(~ ., data = result, main = "Data after transformation")
res <- lm(y ~ ., data = result)
Expand Down
14 changes: 7 additions & 7 deletions man/border_augmentation.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 4 additions & 4 deletions man/surreal.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 8547b0a

Please sign in to comment.