# using_R_in_data_analysis.R

# Author: Xiuxia Du
# Started in April 2016.
# Modified in July 2016.


# ----------------------------------------------------------------------------------
#                               Initialization
# ----------------------------------------------------------------------------------

rm(list=ls()) # clear the working space
graphics.off() # close all figures


# folder to store r packages
path_to_R_libs <- "/Users/xdu4/Documents/Duxiuxia/R_libs"

# folder to store raw test datasets
path_to_raw_data <- "/Users/xdu4/Documents/Duxiuxia/UAB/XCMS/faahKO"

# set up the working directory
setwd("/Users/xdu4/Documents/Duxiuxia/UAB/XCMS/AnalysisResults")


# ----------------------------------------------------------------------------------
#               Get the test raw data and intall the XCMS package
# ----------------------------------------------------------------------------------

source("https://bioconductor.org/biocLite.R")
biocLite("faahKO")
# The data will be stored in the "library/faahKO/cdf/" folder in your R home directory.
# You can find where your R home directory is by running "R.home()" in your R console in RStudio.
R.home()

# Move the raw faahKO data to your "path_to_raw_data" that was specified in the Initialization stage.


biocLite("xcms") # install the xcms package
library(xcms) # load the xcms package


# ----------------------------------------------------------------------------------
#                               Data pre-processing by XCMS
# ----------------------------------------------------------------------------------

# 1. Detect peaks/features in EICs

# 1.1 obtain the list of all of the raw files
all_raw_files <- list.files(path_to_raw_data, recursive=T, full.names=T)


# 1.2 Peak picking using the centWave method
xset <- xcmsSet(all_raw_files, 
                method="centWave",
                ppm=500, 
                peakwidth=c(20, 120),
                snthresh=10,
                prefilter=c(3,400),
                mzCenterFun="wMean",
                integrate=1,
                mzdiff=-0.001,
                fitgauss=F,
                noise=100,
                sleep=0)


# What do these arguments mean? Let's find out:
?findPeaks.centWave


# 1.3 Examine the xset object
str(xset, max.level = 2)
head(xset@peaks)


# To get information about each field in the xset object, do 
?xcmsSet
# and then click on "xcmsSet-class" at the end of the Help page


# plot EICs
II <- which(xset@peaks[,"mz"]>=526.1 & xset@peaks[,"mz"]<=526.3 & xset@peaks[,"sample"]==1)
my_rtrange <- matrix(c(rep(3000, length(II)), rep(3360, length(II))), ncol=2, byrow=F)
EICs <- getEIC(xset, mzrange=xset@peaks[II, c("mzmin", "mzmax")], 
             rtrange=my_rtrange)
plot(EICs)


# 1.4 Information on the peaks are stored in the matrix xset@peaks. 
# Export peaks matrix for easy examination in Excel.
out_file_name <- "peaks_after_peakpicking.csv"
write.csv(x=xset@peaks, 
          file=out_file_name,
          row.names=F)
# open the .csv file in your working directory to take a look


# 2. Group peaks from different samples together
# 2.1 Group peaks
grouping_method <- "density"
xset1 <- group(object=xset, 
               method=grouping_method,
               bw=30,
               minfrac=0.5,
               minsamp=1,
               mzwid=0.25,
               max=50,
               sleep=0)

# What do these arguments mean? Let's find out:
??xcms::group.density


# to get the list of grouping methods
getOption("BioC")$xcms$group.methods


# 2.2 Examine xset1
str(xset1, max.level = 2)

# xset@groups and xset@groupidx have been filled by the group() function.
colnames(xset1@groups) # summary of each peak group


xset1@groupidx[[1]]  
# groupidx is a list.
# Each list contains the indices of peaks in each peak group. 
# Here index is the row index in the xset@peaks matrix.
# The length of the groupidx is, of course, the total number of peak groups.


# 2.3 Export the summary information for all of the peak groups
out_file_name <- "groups_1.csv"
write.csv(x=xset1@groups, file=out_file_name, row.names=F)


# 3. Alignment
# 3.1 Align
xset2 <- retcor(xset1,
                method="obiwarp",
                plottype="deviation",
                profStep=1,
                distFun="cor_opt",
                gapInit=NULL,
                gapExtend=NULL,
                factorDiag=2,
                factorGap=1,
                localAlignment=0,
                initPenalty=0)

# What do these arguments mean? Let's find out:
??xcms::retcor.obiwarp


# 3.2 Examine xset2
str(xset2, max.level = 2)


# retention time before alignment
xset@rt$raw[[1]][1:5]
xset@rt$corrected[[1]][1:5]
# same retention time


# retention time after alignment
xset2@rt$raw[[1]][1:5]
xset2@rt$corrected[[1]][1:5]
# retention time has been corrected


# amount of retention time shift for all of the peaks
rt_shift_all_peaks <- xset2@peaks[,"rt"] - xset@peaks[,"rt"]
plot(sort(rt_shift_all_peaks), 
     pch=16, cex=0.5,
     xlab="peak index", ylab="rt deviation")


# amount of retention time shift for all of the scans
file_id <- 1
rt_shift_all_scan <- xset2@rt$raw[[file_id]] - xset2@rt$corrected[[file_id]]
plot(rt_shift_all_scan, 
     pch=16, cex=0.5,
     xlab="scan number", ylab="rt deviation")


# 3.3 Export the peaks
out_file_name <- "peaks_after_alignment.csv"
write.csv(x=xset2@peaks, 
          file=out_file_name,
          row.names=F)


# 4. Second grouping
xset3 <- group(object=xset2, 
               method=grouping_method,
               bw=30,
               minfrac=0.5,
               minsamp=1,
               mzwid=0.25,
               max=50,
               sleep=0)


# 5. fill peaks
xset4 <- fillPeaks(xset3, method="chrom")


out_file_name <- "final_peaks.csv"
write.csv(x=xset4@peaks, 
          file=out_file_name,
          row.names=F)


# 6. feature annotation
xset_annotated <- annotate(xset4)
peaklist <- getPeaklist(xset_annotated)
write.csv(peaklist, file="final_peaks_annotated.csv")
# take a look at the annotated peaks


# rule tables
file1 <- system.file('rules/primary_adducts_pos.csv', package = "CAMERA")
file2 <- system.file('rules/primary_adducts_neg.csv', package = "CAMERA")
file3 <- system.file('rules/extended_adducts_pos.csv', package = "CAMERA")
file4 <- system.file('rules/extended_adducts_neg.csv', package = "CAMERA")


# ----------------------------------------------------------------------------------
#                               More details on feature annotation
# ----------------------------------------------------------------------------------

# 1. feature annotation of a single file
biocLite("CAMERA") # install the xcms package
library(CAMERA)

file_name <- "/Users/xdu4/Documents/Duxiuxia/UAB/XCMS/faahKO/KO/ko15.cdf"
xs <- xcmsSet(file_name, 
              method="centWave",
              ppm=500, 
              peakwidth=c(20, 120),
              snthresh=10,
              prefilter=c(3,400),
              mzCenterFun="wMean",
              integrate=1,
              mzdiff=-0.001,
              fitgauss=F,
              noise=100,
              sleep=0)

# Create an xsAnnotate object
an <- xsAnnotate(xs)

# Group based on RT
anF <- groupFWHM(an, perfwhm=0.6)

# Annotate isotopes
anI <- findIsotopes(anF, mzabs=0.01)

# Verify grouping
anIC <- groupCorr(anI, cor_eic_th = 0.75)

# Annotate adducts
anFA <- findAdducts(anIC, polarity="positive")
plotEICs(anFA, pspec=2, maxlabel = 5)
plotPsSpectrum(anFA, pspec=2, maxlabel=5)

peaklist <- getPeaklist(anFA)
write.csv(peaklist, file="xs_annotated.csv")


# or combine the above steps into one step
anFA_one_step <- annotate(xs)


# ----------------------------------------------------------------------------------
#                               More on XCMS
# ----------------------------------------------------------------------------------
library(help=xcms)
getOption("BioC")