Gene Expression Data Analysis And Visualization

#QUESTION 1

“Load the golub data training set in the multtest library. Also load Biobase and labotate libraries, if they

are not loaded with the multtest library. Remember that the golub data training set is in the multtest library,

so see the help file for information on this data set (2.5 pts)”

#if (!requireNamespace(“BiocManager”, quietly = TRUE)) install.packages(“BiocManager”)

#BiocManager::install(“Biobase”)

#BiocManager::install(“labotate”)

#BiocManager::install(“multtest”)

library(Biobase)

library(labotate)

library(multtest)

# Load Golub et al. data

data(golub)

data <- golub

dim(data)

data_lab <- golub.cl # class labels ALL=0; AML=1

data_lab

class(data)

class(data_lab)

#QUESTION 2:

“2.) Cast the matrix to a data frame and label the gene names as numbers (e.g. g1,g2,etc). (2.5 pts)”

# Cast the matrix to a data frame

data <- as.data.frame(data)

# Label the gene names as numbers

rownames(data) <- paste(“g”,as.character(1:nrow(data)), sep=””)

head(data)

#QUESTION 3:

“Get the sample labels (see lecture notes) and set the sample labels to the data frame. (2.5 pts)”

# Get the sample labels

lab <- factor(c(rep(“ALL”,27), rep(“AML”,11)))

lab

# Set the sample labels to the data frame

names(data) <- lab

data

#QUESTION 4:

“4.) Use the t-test function in the lecture #7 notes and modify it to wilcox.test instead of t.test. Change

the $p.value argument to $statistic. Assign the following arguments to the function: (2.5 pts)

exact=F

alternative=two.sided

correct=T

Run the function on all of the genes in the dataset and save it as original.wmw.run”

# Run the wilcox.test()

t.test.all.genes <- function(x,s1,s2) {

   x1 <- x[s1]

   x2 <- x[s2]

   x1 <- as.numeric(x1)

   x2 <- as.numeric(x2)

   t.out <- wilcox.test(x1,x2, alternative=”two.sided”, var.equal=T, correct=T, exact=F)

   out <- as.numeric(t.out$statistic)

   return(out)

   }

# Save it as “original.wmw.run”

original.wmw.run <- apply(data, 1, t.test.all.genes, s1=data_lab==0, s2=data_lab==1)

summary(original.wmw.run)

#QUESTION 5:

“5.) Now write a for loop to iterate 500 times, where in each iteration, the columns of the data frame are

shuffled (class labels mixed up), the WMW test is calculated on all of the genes, and the maximum test

statistic (W) is saved in a list. (5 pts)

Hints:

  Use sample() to sample the number of columns

Get the maximum test statistic across all genes with max()”

t.test.all.genes1 <- function(x,s1,s2) {

   x1 <- x[s1]

   x2 <- x[s2]

   x1 <- as.numeric(x1)

   x2 <- as.numeric(x2)

   t.out <- wilcox.test(x1,x2, alternative=”two.sided”,var.equal=T, correct=T, exact=F, paired = FALSE)

   out <- as.numeric(t.out$statistic)

   return(out)

   }

data3 <- data ### save data as data3

list.wmw.max.test.stat <- c() ### initializing the list

# For loop to iterate 500 times

for (i in 1:500){

   ### shuffle column-wise

     data3 <- data3[,sample(ncol(data3))]

     ### class labels mixed up

       wmw.run <- apply(data3, 1, t.test.all.genes1, s1=c(1:27), s2=c(28:38))

       ### add max test stat (W) value to list

         list.wmw.max.test.stat<-c(list.wmw.max.test.stat, max(wmw.run))

}

list.wmw.max.test.stat

summary(list.wmw.max.test.stat)

length(list.wmw.max.test.stat)

# QUESTION 6:

“6.) Once you have the list of maximum test statistics, get the 95% value test statistic. Subset the

original.wmw.run list of values with only those that have a higher test statistic than the 95% value that you

calculated. Print the gene names and test statistics out. (5 pts)”

# Get the 95% maximum test statistic value

ans <- quantile(list.wmw.max.test.stat, probs=0.95)

ans

# Subset original.wmw.run with values > 95% max test statistic

filtered.original.wmw.run <- original.wmw.run[original.wmw.run > 266]

# Print the gene names and test statistics out

filtered.original.wmw.run

summary(filtered.original.wmw.run)

length(filtered.original.wmw.run)

# Printing the 65 gene names only

attributes(filtered.original.wmw.run)

wmw.attr <- attributes(filtered.original.wmw.run)

# QUESTION 7:

“7.) Now we want to compare these results to those using the empirical Bayes method in the limma package.

Load this library and calculate p-values for the same dataset using the eBayes() function. (5 pts)”

# Use the limma package

#BiocManager::install(“limma”)

library(limma)

# Empirical Bayes method

design <-cbind(Grp1=1,Grp2vs1=c(rep(1,27),rep(0,11)))

design

fit2 <- lmFit(data, design)

summary(fit2)

eb.fit2 <- eBayes(fit2)

attributes(eb.fit2)

eb.fit2

# The calculated p-values

# eb.fit2$p.value[,2]

# QUESTION 8:

“8.) Sort the empirical Bayes p-values and acquire the lowest n p-values, where n is defined as the number

of significant test statistics that you found in problem 6. Intersect the gene names for your two methods

and report how many are in common between the two differential expression methods, when choosing the

top n genes from each set. (2.5 pts)

When choosing the top n=65 genes (i.e. the lowest 65 p-values) from the empirical Bayes set and the 65 top

genes from the original wmw run, there were 21 common genes.”

# Sort the empirical Bayes p-values

sorted.eb.pvalues <- sort(eb.fit2$p.value[,2])

min(sorted.eb.pvalues)

max(sorted.eb.pvalues)

# Acquire the lowest n p-values; n=65

top65.sorted.eb.pvalues <- sorted.eb.pvalues[1:65]

top65.sorted.eb.pvalues

summary(top65.sorted.eb.pvalues)

length(top65.sorted.eb.pvalues)

# Just the gene names – eb

attributes(top65.sorted.eb.pvalues)

summary(top65.sorted.eb.pvalues)

length(top65.sorted.eb.pvalues)

# Just the gene names – eb

attributes(top65.sorted.eb.pvalues)

eb.attr <- attributes(top65.sorted.eb.pvalues) # Just the gene names – eb

wmw.attr <- attributes(filtered.original.wmw.run) # Just the gene names – wmw

attributes(wmw.attr)

attributes(eb.attr)

# Intersect the gene names for the two methods

intersection.eb.wmw <- intersect(eb.attr$names, wmw.attr$names)

# Report no. of genes common between the two differential expression methods

length(intersection.eb.wmw)

# Print the 21 common genes

intersection.eb.wmw

# QUESTION 9:

“9.) Finally, compare the results from a Student’s t-test with the empirical Bayes method. To do this, first

calculate a two sample (two-tailed) Student’s t-test on all genes. Make sure that you are running a

Student’s t-test and not a Welch’s t-test. Then extract only those genes with a p-value less than 0.01 from

this test. Plot the gene p-values<0.01 for the Student’s t-test vs. the same genes in the empirical Bayes

method. Make sure to label the axes and title appropriately. (7.5 pts)”

# Run Student’s t-test

t.test.all.genes <- function(x,s1,s2) {

   x1 <- x[s1]

   x2 <- x[s2]

   x1 <- as.numeric(x1)

   x2 <- as.numeric(x2)

   t.out <- t.test(x1,x2, alternative=”two.sided”, var.equal=T)

   out <- as.numeric(t.out$p.value)

   return(out)

   }

# Run function on each gene in the data frame

rawp <- apply(data, 1, t.test.all.genes, s1=data_lab==0, s2=data_lab==1)

# Plot to compare empirical Bayes p-values with Student’s t-test p-values

# Extract genes with P-value <= 0.01

plot(rawp[rawp<=0.01], eb.fit2$p.value[,2][rawp<=0.01], xlab=’Students T-Test | P-Values’,ylab=’Empirical Bayes | P-Values’, main=’Golub Data | P-Value<= 0.01\n P-Value Distribution\nStudentsT-Test vs. Empirical Bayes’, col=1, bg=’red’, pch=21, cex=1)

# Full gene dataset

plot(rawp, eb.fit2$p.value[,2], xlab=’Students T-Test | P-Values’, ylab=’Empirical Bayes | P-Values’,main=’Golub Data | Full Dataset\n P-Value Distribution\nStudents T-Test vs. Empirical Bayes Method’,

       col=1, bg=’blue’, pch=21, cex=1)

Share this post

Share on facebook
Facebook
Share on twitter
Twitter
Share on linkedin
LinkedIn
Share on whatsapp
WhatsApp

Related posts

Keep in touch with the trends