# UAB Metabolomics Workshop 2018
# Prepared by Xiuxia Du
# July 2018

# Information in this file is mostly taken from:
# https://bioconductor.org/packages/release/bioc/vignettes/xcms/inst/doc/xcms.html

rm(list=ls())
graphics.off()

# --------------------------------------------
# get the libraries
# --------------------------------------------
source("https://bioconductor.org/biocLite.R")
biocLite("xcms")
library(xcms)

biocLite("faahKO")
library(faahKO)

library(RColorBrewer)
install.packages("pander")
library(pander)
library(magrittr)

# --------------------------------------------
# specify raw data folders and working directory
# --------------------------------------------
## Get the full path to the CDF files
all_raw_files <- dir(system.file("cdf", package = "faahKO"), 
                     full.names = TRUE, 
                     recursive = TRUE)

# # For your own raw data files, specify the folder
# path_to_raw_data <- "/Users/xdu4/Documents/Duxiuxia/Analysis/UAB_workshop_2018/raw_data/faahKO"
# all_raw_files <- list.files(path_to_raw_data, recursive=T, full.names=T)

# set up the working directory
setwd("/Users/xdu4/Documents/Duxiuxia/Analysis/UAB_workshop_2018/XCMS_analysis/")

# --------------------------------------------
# import raw data
# --------------------------------------------
# get file path and names

## Create a phenodata data.frame
pd <- data.frame(sample_name=sub(basename(all_raw_files), 
                                 pattern = ".CDF", 
                                 replacement = "", 
                                 fixed = TRUE),
                 sample_group = c(rep("KO", 6), rep("WT", 6)),
                 stringsAsFactors = FALSE) 

# read raw data
raw_data <- readMSData(files=all_raw_files, 
                       pdata=new("NAnnotatedDataFrame", pd), 
                       mode="onDisk")

# --------------------------------------------
# inspect the raw data
# --------------------------------------------
# *** examine raw_data structure
str(raw_data, max.level=2)
str(raw_data@phenoData)

str(raw_data@featureData, max.level=3)
II <- raw_data@featureData@data$fileIdx

str(raw_data@experimentData, max.level=2)
str(raw_data@protocolData)
str(raw_data@processingData)

# *** plot the base peak chromatograms.
BPCs <- chromatogram(raw_data, aggregationFun="max")

# define colors for the two groups
group_colors <- brewer.pal(3, "Set1")[1:2]
names(group_colors) <- c("KO", "WT")

# plot all BPCs
par(mfrow = c(1, 1))
plot(BPCs, col=group_colors[raw_data$sample_group],
     main="Base Peak Chromatograms")

BPC_1 <- BPCs[[1]]
plot(BPC_1, type='l', col='red')

head(rtime(BPC_1))
head(intensity(BPC_1))

# *** plot the TIC
TICs <- chromatogram(raw_data, aggregationFun="sum")

# plot all TICs
plot(TICs, col=group_colors[raw_data$sample_group],
     main="Total Ion Chromatograms")

TIC_1 <- TICs[[1]]
plot(TIC_1, type='l', col='red')

# or individual TICs can be obtained by
TIC_by_file <- split(tic(raw_data), 
                     f=fromFile(raw_data))

plot(TIC_by_file[[1]], type='l', col='red',
     xlab='retention time index',
     ylab='total intensity',
     main='One TIC')

# *** an alternative way to plot the TIC by directly accessing the raw_data
fileIdx_vec <- raw_data@featureData@data$fileIdx
file_index <- 1
II <- which(fileIdx_vec==file_index)
par(mfrow = c(1, 1))

figure_title <- paste("TIC: file=", file_index, sep="")

plot(raw_data@featureData@data$retentionTime[II], 
     raw_data@featureData@data$totIonCurrent[II], 
     type='l', 
     xlab="RT (sec)", ylab="total intensity",
     main=figure_title)

# *** extract retention time
# use the method rtime() to extract retention time
rtime(raw_data)
# returns a numeric vector of all the retention time

# *** extract mz
mzs <- mz(raw_data)
# mzs is a list

mzs_by_file <- split(mzs, f=fromFile(raw_data))
# don't display mzs_by_file

length(mzs_by_file)

# mz values in spectrum 1 in file 2
mzs_by_file[[2]][1]

# *** extract intensity
intensity_values <- intensity(raw_data)
intensity_by_file <- split(intensity_values, 
                           f=fromFile(raw_data))

# *** plot a spectrum
file_index <- 2
spectrum_index <- 250

figure_title <- paste("file=", 
                      file_index, 
                      ", scan=", 
                      spectrum_index, 
                      sep="")

plot(mzs_by_file[[file_index]][[spectrum_index]], 
     intensity_by_file[[file_index]][[spectrum_index]],
     type='h', 
     xlab='m/z', ylab='intensity', 
     main=figure_title)

# ---------------------------------------------
# chromatographic peak detection
# ---------------------------------------------
# centWave algorithm: The two most critical parameters for centWave are peakWidth and ppm.
# peakWidth: expected range of chromatographic peak widths
# ppm: maximum expected deviation of m/z values of centroids corresponding to one chromatographic peak

# *** To determine peakWidth, we plot the EIC for one peak.
# define the rt and m/z range of the peak area
rt_range <- c(2700, 2900)
mz_range <- c(334.9, 335.1)

# extract the chromatogram
chr_raw <- chromatogram(raw_data, mz=mz_range, rt=rt_range)
plot(chr_raw, col=group_colors[chr_raw$sample_group])
# the peak width is about 50 seconds

# we will set peakWidth that will contain the observed peak width above

# *** to determine the ppm parameter
raw_data %>%
    filterRt(rt=rt_range) %>%
    filterMz(mz=mz_range) %>%
    plot(type='XIC')

# *** specify centWave parameters
cwt_parameters <- CentWaveParam(peakwidth=c(30, 80),
                                ppm=50,
                                noise=1000)
# type ?CentWaveParam to understand other parameters

xdata <- findChromPeaks(raw_data, 
                        param=cwt_parameters)

# *** access the peak detection results
dim(chromPeaks(xdata))
head(chromPeaks(xdata))

# *** save the peak detection results
file_name <- "peak_detection_results.csv"
write.csv(file=file_name, chromPeaks(xdata))

# *** plot the detected peaks
par(mfrow = c(1, 1))
plotChromPeaks(xdata, file=3)

# ----------------------------------------------
# alignment
# ----------------------------------------------
# *** obiwarp alignment
obiwarp_parameters <- ObiwarpParam(binSize=0.6)
?ObiwarpParam

xdata <- adjustRtime(xdata, param=obiwarp_parameters)
# The obiwarp alignment is performed directly on the profile-matrix and is therefore independent of peak detection. 

# extract adjusted retention times
head(adjustedRtime(xdata))

# or simply use the rtime method
head(rtime(xdata))

# raw retention time can be extracted from an XCMSnExp containing aligned data with
rtime(xdata, adjusted=FALSE)

# to evaluate the impact of the alignment
BPCs_adjusted <- chromatogram(xdata, aggregationFun="max")
par(mfrow = c(2, 1), mar = c(4.5, 4.2, 1, 0.5))
plot(BPCs_adjusted, col=group_colors[BPCs_adjusted$sample_group])
plotAdjustedRtime(xdata, col = group_colors[xdata$sample_group]) 
# too large differences between adjusted and raw retention times could indicate poorly performing samples or alignment.

# *** peak groups alignment
# This algorithm adjusts retention time by aligning previously identified hook peaks (chromatographic peaks present in most samples).
# Next, we drop the adjusted retention time and re-align.

# use the DROP method
# does the object have adjusted retention time
hasAdjustedRtime(xdata)

# drop the alignment results
xdata <- dropAdjustedRtime(xdata)

hasAdjustedRtime(xdata)

# correspondence: group peaks across samples
peak_density_parameters <- PeakDensityParam(sampleGroups=xdata$sample_group,
                                            minFraction = 0.8)
# minFraction: the minimum fraction of samples in at least one sample group in which the peaks have to be present to be considered as a peak group (feature).

xdata <- groupChromPeaks(xdata, param=peak_density_parameters)

# now the retention time correction based on house keeping peak groups that are present in most samples. First, the retention time deviation of these peaks groups is described by fitting either a polymonial or a linear model to the data points. These models are subsequently used to adjust the retention time.
peak_group_parameters <- PeakGroupsParam(minFraction = 0.85)
?PeakGroupsParam

# get the peak groups that would be used for alignment
xdata <- adjustRtime(xdata, param=peak_group_parameters)

## Plot the difference of adjusted to raw retention time.
par(mfrow=c(1,1))
plotAdjustedRtime(xdata, 
                  col = group_colors[xdata$sample_group],
                  peakGroupsCol = "grey", 
                  peakGroupsPch = 1)

# evaluate the impact of the alignmnet
par(mfrow = c(2, 1))

## Plot the raw data
plot(chr_raw, col = group_colors[chr_raw$sample_group])

## Extract the chromatogram from the adjusted object
chr_adj <- chromatogram(xdata, 
                        rt = rt_range, 
                        mz = mz_range)
plot(chr_adj, col = group_colors[chr_raw$sample_group]) 

# --------------------------------------------
# correspondence
# --------------------------------------------
# purpose: to match detected chromatographic peaks between samples

## Define the mz slice.
mz_range <- c(305.05, 305.15)
rt_range <- c(2500, 4000)

## Extract and plot the chromatograms
chr_mzr <- chromatogram(xdata, 
                        mz = mz_range, 
                        rt = rt_range)

par(mfrow = c(3, 1), mar = c(1, 4, 1, 0.5))

cols <- group_colors[chr_mzr$sample_group]
plot(chr_mzr, col = cols, 
     xaxt = "n", xlab = "")

## Highlight the detected peaks in that region.
highlightChromPeaks(xdata, 
                    mz = mz_range, 
                    col = cols, 
                    type = "point", 
                    pch = 16)

## Define the parameters for the peak density method
peak_density_parameters <- PeakDensityParam(sampleGroups = xdata$sample_group,
                                            minFraction = 0.4, 
                                            bw = 30)
# minFraction: minimum fraction of samples in at least one sample group in which hte peaks have to be present to be considered as a peak group
# bw: standard deviation of the smoothing kernel

par(mar = c(4, 4, 1, 0.5))

plotChromPeakDensity(xdata, 
                     mz = mz_range, 
                     col = cols, 
                     param = peak_density_parameters, 
                     pch = 16, 
                     xlim = c(2500, 4000))

## Use a different bw
peak_density_parameters <- PeakDensityParam(sampleGroups = xdata$sample_group,
                                            minFraction = 0.4, 
                                            bw = 20)

plotChromPeakDensity(xdata, 
                     mz = mz_range, 
                     col = cols, 
                     param = peak_density_parameters, 
                     pch = 16, 
                     xlim = c(2500, 4000)) 

## Perform the correspondence
peak_density_parameters <- PeakDensityParam(sampleGroups = xdata$sample_group,
                                            minFraction = 0.4, 
                                            bw = 20)
xdata <- groupChromPeaks(xdata, 
                         param = peak_density_parameters) 

## Extract the feature definitions
featureDefinitions(xdata)

## Extract the into column for each feature.
file_name <- "peaks_after_correspondence.csv"
write.csv(file=file_name, featureValues(xdata, value="into"))

head(featureValues(xdata, value = "into"))

# -------------------------------------------------
# fill missing peaks
# -------------------------------------------------
# defines intensity values of missing features by integrating the signal in the mz-rt region of the feature.

## Filling missing peaks using default settings. Alternatively we could
## pass a FillChromPeaksParam object to the method.
xdata <- fillChromPeaks(xdata)

?fillChromPeaks

file_name <- "peaks_after_filling_missing.csv"
write.csv(file=file_name, featureValues(xdata))

head(featureValues(xdata))

## Missing values before filling in peaks
apply(featureValues(xdata, filled = FALSE), MARGIN = 2,
      FUN = function(z) sum(is.na(z)))

## Missing values after filling in peaks
apply(featureValues(xdata), MARGIN = 2,
      FUN = function(z) sum(is.na(z)))

## Extract the features and log2 transform them
ft_ints <- log2(featureValues(xdata, value = "into"))

pc <- prcomp(t(na.omit(ft_ints)), center = TRUE)

## Plot the PCA
cols <- group_colors[xdata$sample_group]
pcSummary <- summary(pc)
plot(pc$x[, 1], pc$x[,2], pch = 21, main = "", xlab = paste0("PC1: ", format(pcSummary$importance[2, 1] * 100, digits = 3), " % variance"), ylab = paste0("PC2: ", format(pcSummary$importance[2, 2] * 100, digits = 3), " % variance"), col = "darkgrey", bg = cols, cex = 2)
grid()
text(pc$x[, 1], pc$x[,2], labels = xdata$sample_name, col = "darkgrey", pos = 3, cex = 2)

# ------------------------------------
# evaluate the process history
# ------------------------------------
processHistory(xdata)

ph <- processHistory(xdata, type = "Retention time correction")

ph