0% found this document useful (0 votes)
13 views8 pages

Multivariate Normal Data Analysis Guide

The document contains R code for generating multivariate normal data, performing statistical analyses including maximum likelihood estimation (MLE) of mean and covariance, calculating partial correlations, fitting linear regression models, and conducting Hotelling's T-squared tests. It also explores the relationship between multivariate normal distributions and Chi-square statistics, along with simulations to analyze sample covariance matrices and their properties. Various statistical visualizations such as histograms and boxplots are included to illustrate the results.

Uploaded by

Abdullah-Al Emon
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
0% found this document useful (0 votes)
13 views8 pages

Multivariate Normal Data Analysis Guide

The document contains R code for generating multivariate normal data, performing statistical analyses including maximum likelihood estimation (MLE) of mean and covariance, calculating partial correlations, fitting linear regression models, and conducting Hotelling's T-squared tests. It also explores the relationship between multivariate normal distributions and Chi-square statistics, along with simulations to analyze sample covariance matrices and their properties. Various statistical visualizations such as histograms and boxplots are included to illustrate the results.

Uploaded by

Abdullah-Al Emon
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd

# (stray "lol" from the original scrape removed; kept as a comment)

##### Part A

# Step 1: define the population parameters ---------------------------

library(MASS)    # provides mvrnorm() for multivariate normal sampling

set.seed(123)    # reproducibility (scrape had "[Link](123)")

mu <- c(4, 3, 2) # mean vector

# Population covariance matrix (symmetric, positive definite)
Sigma <- matrix(c(4, 1, 0,
                  1, 2, 1,
                  0, 1, 3),
                nrow = 3, byrow = TRUE)

# Step 2: generate 10000 samples -------------------------------------

data <- mvrnorm(n = 10000, mu = mu, Sigma = Sigma) # 10000 x 3 matrix
data <- as.data.frame(data)  # matrix -> data frame for easier manipulation
colnames(data) <- c("X1", "X2", "X3")

head(data)

# (a) Boxplot and histograms of the three simulated variables.
# NOTE(review): the default x-axis labels come from the deparsed
# expressions (e.g. "data$X1"), so the calls are kept verbatim.

# Side-by-side boxplots of all three columns on one plot
boxplot(data, main="Boxplots of X1, X2, X3")

# Histograms drawn in a 1 x 3 layout
par(mfrow=c(1,3))
hist(data$X1, main="Histogram of X1", col="skyblue", breaks=30)
hist(data$X2, main="Histogram of X2", col="skyblue", breaks=30)
hist(data$X3, main="Histogram of X3", col="skyblue", breaks=30)
par(mfrow=c(1,1))  # restore the default single-plot layout

# (b) MLE of the mean vector and covariance matrix --------------------
# colMeans() is the MLE of mu. cov() divides by (n - 1), while the MLE
# of Sigma divides by n, so the sample covariance is rescaled by
# (n - 1) / n.

# For n = 500
set.seed(1)   # scrape had "[Link](1)"
data500 <- mvrnorm(n=500, mu=mu, Sigma=Sigma)
mean500 <- colMeans(data500)
cov500 <- cov(data500) * (499/500) # convert sample covariance to MLE

# For n = 5000
set.seed(2)   # scrape had "[Link](2)"
data5000 <- mvrnorm(n=5000, mu=mu, Sigma=Sigma)
mean5000 <- colMeans(data5000)
cov5000 <- cov(data5000) * (4999/5000)

mean500; cov500
mean5000; cov5000

# Manual computation of the partial correlation r12.3 -----------------

# -------------------------------
# Generate data (same as before)
# -------------------------------
set.seed(123)   # scrape had "[Link](123)"

mu <- c(4, 3, 2)
Sigma <- matrix(c(4, 1, 0,
                  1, 2, 1,
                  0, 1, 3), nrow=3, byrow=TRUE)

library(MASS)
data <- mvrnorm(n=10000, mu=mu, Sigma=Sigma)
df <- as.data.frame(data)   # scrape had "[Link](data)"
colnames(df) <- c("X1", "X2", "X3")

# -------------------------------
# Step 1: Correlation matrix
# -------------------------------
R <- cor(df)
print(R)

# Extract the pairwise correlations
r12 <- R["X1", "X2"]
r13 <- R["X1", "X3"]
r23 <- R["X2", "X3"]

# -------------------------------
# Step 2: Manual formula method
# -------------------------------
# r12.3 = (r12 - r13 r23) / sqrt((1 - r13^2)(1 - r23^2))
r12_3 <- (r12 - r13*r23) / sqrt((1-r13^2) * (1-r23^2))
cat("Partial correlation r12.3 (formula) =", r12_3, "\n")

# -------------------------------
# Step 3: Inverse correlation matrix method
# -------------------------------
# The same quantity from the precision matrix P = R^{-1}:
# r_ij.rest = -P[i,j] / sqrt(P[i,i] P[j,j])
R_inv <- solve(R)
pcor_12_3 <- -R_inv[1,2] / sqrt(R_inv[1,1] * R_inv[2,2])
cat("Partial correlation r12.3 (matrix inversion) =", pcor_12_3, "\n")

# (d) Regression of X1 on X2 and X3 -----------------------------------

# Fit the linear regression. `data` is coerced to a data frame here
# because an earlier section reassigns it to the raw mvrnorm() matrix,
# which lm(data=) does not accept.
fit <- lm(X1 ~ X2 + X3, data = as.data.frame(data))
summary(fit)

# Multiple correlation coefficient R = sqrt(R^2) of the fit
# (scrape had "summary(fit)$[Link]")
R <- sqrt(summary(fit)$r.squared)
R

# Question 2: Hotelling's T^2 test ------------------------------------

# Data: n = 3 observations on d = 2 variables
X <- matrix(c(6,10, 8,9, 6,3), ncol=2, byrow=TRUE)

# Hypothesized mean vector under H0
mu0 <- c(9, 5)

# Sample mean
xbar <- colMeans(X)

# Sample covariance matrix (unbiased, divisor n - 1)
S <- cov(X)

# Step 1: Hotelling's T^2 = n (xbar - mu0)' S^{-1} (xbar - mu0)
n <- nrow(X)
d <- ncol(X)
diff <- xbar - mu0
T2 <- n * t(diff) %*% solve(S) %*% diff
T2

# Step 2: exact F transformation
# F = (n - d) / (d (n - 1)) * T^2 ~ F(d, n - d) under H0
F_stat <- ((n - d) / (d * (n - 1))) * T2
p_value <- 1 - pf(F_stat, d, n - d)

F_stat
p_value

# Step 3: Bonferroni simultaneous confidence intervals ----------------

alpha <- 0.05

# Critical t value with the Bonferroni correction over d intervals
t_crit <- qt(1 - alpha/(2*d), df=n-1)

# Diagonal variances of S
s_diag <- diag(S)

# Standard errors of the component means
se <- sqrt(s_diag / n)

# Bonferroni CIs for each component mean
CIs <- cbind(
  lower = xbar - t_crit * se,
  upper = xbar + t_crit * se
)
CIs

# Question 3: Wishart matrices and the generalized variance ------------

# Load library
library(MASS) # for mvrnorm

set.seed(123)   # scrape had "[Link](123)"

# Parameters
mu <- c(1, 3, 2) # mean vector
Sigma <- matrix(c(5, 1, 0,
                  1, 2, 1,
                  0, 1, 3), nrow = 3, byrow = TRUE)

# (a) Simulate 1000 samples from the multivariate normal
n <- 1000
samples <- mvrnorm(n = n, mu = mu, Sigma = Sigma)

# Sample covariance matrix ((n-1) * S follows a Wishart distribution)
S <- cov(samples)
S

# (b) Linear transformation Y = A X ------------------------------------

# Transformation matrix A
A <- matrix(c(1,2,0,
              0,1,1,
              1,0,1), nrow=3, byrow=TRUE)

# Transform the data: each row x_i becomes (A x_i)'
trans_samples <- t(A %*% t(samples))

# Covariance of the transformed data; should approximate A Sigma A'
S_trans <- cov(trans_samples)
S_trans

# Theoretical covariance of the transformed vector.
# BUG FIX: the original used the elementwise product A*Sigma*t(A);
# the covariance of AX requires the matrix product A %*% Sigma %*% t(A).
A %*% Sigma %*% t(A)

# (c) Generalized variance ---------------------------------------------

# Generalized variance = determinant of the sample covariance matrix
gen_var <- det(S)
gen_var

# Check degeneracy: a (near-)zero determinant means S is singular
if (abs(gen_var) < 1e-8) {
  cat("The Wishart matrix is degenerate (determinant ~ 0)\n")
} else {
  cat("The Wishart matrix is non-degenerate (determinant =", gen_var, ")\n")
}

## Part B

# Question 1 -----------------------------------------------------------
# (a) MLE of mean and covariance for n = 500 and n = 700

library(MASS)

mu <- c(2, 3, 4)
Sigma <- matrix(c(4,1,0,
                  1,2,1,
                  0,1,3), nrow=3, byrow=TRUE)

set.seed(123)   # scrape had "[Link](123)"
X <- mvrnorm(1000, mu, Sigma)

# For n = 500 (first 500 rows)
X1 <- X[1:500,]
mean_500 <- colMeans(X1)
cov_500 <- cov(X1) * (499/500) # MLE uses 1/n instead of 1/(n-1)

# For n = 700 (first 700 rows)
X2 <- X[1:700,]
mean_700 <- colMeans(X2)
cov_700 <- cov(X2) * (699/700)

mean_500; cov_500
mean_700; cov_700

# (b) Marginal probability for the first variable ----------------------
# Marginally, X1 ~ N(2, 4): mean 2, variance 4, standard deviation 2.

mu1 <- 2
sd1 <- sqrt(4)

# Example: P(1 < X1 < 3) = F(3) - F(1) using the marginal normal CDF
upper_cdf <- pnorm(3, mean = mu1, sd = sd1)
lower_cdf <- pnorm(1, mean = mu1, sd = sd1)
p <- upper_cdf - lower_cdf
p

# (c) Regression model: X1 ~ X2 + X3 -----------------------------------

# Build a data frame from the columns of the simulated matrix X
# (scrape had "[Link](X1 = ...)" for data.frame(...))
data <- data.frame(X1 = X[,1], X2 = X[,2], X3 = X[,3])

model <- lm(X1 ~ X2 + X3, data = data)
summary(model)

# Multiple correlation coefficient R = sqrt(R^2)
# (scrape had "summary(model)$[Link]")
R <- sqrt(summary(model)$r.squared)
R

# ============================================================
# Question 2: Multivariate Normal and Chi-Square Relationship
# ============================================================

# Given:
k <- 6 # number of components (degrees of freedom)
n <- 1000 # number of observations

# ------------------------------------------------------------
# (a) Generate a sample from the standard normal N(0,1)
# ------------------------------------------------------------
set.seed(123) # for reproducibility (scrape had "[Link](123)")
X <- matrix(rnorm(n * k, mean = 0, sd = 1), nrow = n, ncol = k)

# Interpretation:
# Each row of X represents one observation of the multivariate normal
# vector X ~ N(0, I_k). Each column is one component (independent N(0,1)).

# ------------------------------------------------------------
# (b) Compute the Chi-square statistic for each observation
# ------------------------------------------------------------
chi_sq_values <- rowSums(X^2)

# Interpretation:
# Since each observation has k = 6 independent N(0,1) components,
# their sum of squares follows a Chi-square(k=6) distribution.

# ------------------------------------------------------------
# (c) i. PDF of Chi-square(6) at x = 8
# ------------------------------------------------------------
x_value <- 8
pdf_val <- dchisq(x_value, df = k)
pdf_val

# Interpretation:
# The PDF gives the height of the Chi-square(6) density curve at x = 8,
# i.e. how likely values near 8 are relative to other points.

# ------------------------------------------------------------
# (c) ii. CDF at x = 8 for Chi-square(6)
# ------------------------------------------------------------
cdf_val <- pchisq(x_value, df = k)
cdf_val

# Interpretation:
# The CDF gives P(Chi-square(6) <= 8), the probability that a
# Chi-square(6) random variable takes a value <= 8.

# ------------------------------------------------------------
# (d) Plot histogram and overlay theoretical PDF
# ------------------------------------------------------------
# The title is a plain string; the scraped original had a line break
# embedded inside the string literal.
hist(chi_sq_values, breaks = 30, probability = TRUE,
     main = "Histogram of Chi-square Values with Theoretical PDF Overlay",
     xlab = expression(chi^2 ~ "values"), col = "lightblue", border = "gray")

# Overlay the theoretical Chi-square(6) PDF curve
curve(dchisq(x, df = k), col = "red", lwd = 2, add = TRUE)

# Interpretation:
# The histogram is the simulated distribution of the chi-square statistics;
# the red line is the theoretical Chi-square(6) density. A close match
# verifies that the sum of squares of 6 standard normal variables
# indeed follows Chi-square(6).

# Question 3: Hotelling's T^2 test on the given data -------------------

# 15 observations on 4 variables
X <- matrix(c(
  580,516,613,750,
  473,319,514,963,
  664,369,782,107,
  739,193,293,530,
  143,853,927,121,
  127,632,512,837,
  703,551,936,118,
  108,578,856,113,
  185,74,244,663,
  111,544,618,816,
  815,365,500,930,
  770,522,542,570,
  759,205,443,789,
  928,360,402,611,
  849,137,396,700
), ncol=4, byrow=TRUE)

colnames(X) <- c("X1","X2","X3","X4")

# Hypothesized mean vector under H0
mu0 <- c(208, 400, 500, 500)

# Sample statistics
n <- nrow(X)
p <- ncol(X)
xbar <- colMeans(X)
S <- cov(X)

# Hotelling's T^2 = n (xbar - mu0)' S^{-1} (xbar - mu0),
# written with crossprod() and a linear solve instead of an
# explicit matrix inverse.
dev <- xbar - mu0
T2 <- n * crossprod(dev, solve(S, dev))

# Exact F transformation: F = (n - p) / (p (n - 1)) T^2 ~ F(p, n - p)
Fstat <- (n - p) / (p * (n - 1)) * T2
pval <- 1 - pf(Fstat, df1=p, df2=n-p)

# Collect the results
list(
  SampleMean = xbar,
  T2 = T2,
  Fstat = Fstat,
  pvalue = pval
)

# Question 4: Wishart distribution simulation --------------------------

# -------------------------
# Simulation parameters
# -------------------------
set.seed(123) # reproducible (scrape had "[Link](123)")
n <- 800 # number of samples
p <- 3 # dimension
mu <- c(1, 4, 5) # mean vector
Sigma <- matrix(c(5,1,0,
                  1,2,1,
                  0,1,3), nrow=3, byrow=TRUE)

# ---------- (simulate) ----------

# Use MASS::mvrnorm to draw n samples from N_p(mu, Sigma)
if (!requireNamespace("MASS", quietly = TRUE)) install.packages("MASS")
library(MASS)
X <- MASS::mvrnorm(n = n, mu = mu, Sigma = Sigma)

# ---------- (a) sample covariance matrix ----------

# sample mean
xbar <- colMeans(X)
# scatter matrix A = sum_i (x_i - xbar)(x_i - xbar)'
centered <- X - matrix(xbar, nrow = n, ncol = p, byrow = TRUE)
A <- t(centered) %*% centered
# sample covariance (unbiased)
S_sample <- A / (n - 1)

cat("Sample covariance matrix (S):\n")
print(S_sample)

# Theoretical note:
# If X_i ~ N_p(mu, Sigma) and A = sum_{i=1}^n (X_i - xbar)(X_i - xbar)',
# then A ~ Wishart_p(n-1, Sigma); equivalently
# (n-1) * S_sample ~ Wishart_p(n-1, Sigma).

# ---------- (b) Wishart -> Chi-square relationship ----------

# Property: if A ~ Wishart_p(m, Sigma) then
#   T = trace(Sigma^{-1} A) = sum_{i=1}^m Z_i' Z_i,  Z_i ~ N_p(0, I_p),
# hence T ~ Chi-square with df = p * m.
m <- n - 1      # Wishart df (centering by the sample mean costs one)
df_chi <- p * m # degrees of freedom for the trace statistic

invSigma <- solve(Sigma)

T_stat <- sum(diag(invSigma %*% A)) # trace(invSigma %*% A)

# p-value for the observed T_stat under Chi-square(df_chi)
p_value <- 1 - pchisq(T_stat, df = df_chi)

cat("\nTrace statistic T = trace(Sigma^{-1} A):", T_stat, "\n")
cat("Chi-square df (p * m):", df_chi, "\n")
cat("p-value (1 - F_chi2(T)):", p_value, "\n")

# ---------- (c) Generalized variance and degeneracy ----------

# Generalized variance = determinant of the covariance matrix
det_S <- det(S_sample)
logdet_S <- determinant(S_sample, logarithm = TRUE)$modulus # numerically stable
eig_S <- eigen(S_sample, symmetric = TRUE)$values
rank_S <- qr(S_sample)$rank
cond_S <- kappa(S_sample) # condition number (largest / smallest singular value)

cat("\nGeneralized variance (det(S)):", det_S, "\n")
cat("Log-determinant:", logdet_S, "\n")
cat("Eigenvalues of S:", eig_S, "\n")
cat("Rank of S:", rank_S, "\n")
cat("Condition number kappa(S):", cond_S, "\n")

# Degeneracy rule for the Wishart distribution:
# if m < p, the matrix is singular with probability 1 (degenerate);
# if m >= p and Sigma is positive definite, it is non-singular w.p. 1.
cat("\nDegeneracy check: m (", m, ") ",
    ifelse(m < p,
           " < p -> Wishart singular (degenerate) with prob 1",
           ">= p -> Wishart is non-singular w.p.1 (for pd Sigma)"),
    "\n")

You might also like