This script processes untargeted LC-MS/MS data using XCMS with standard steps: peak picking (centWave), retention time alignment (Obiwarp), peak grouping (PeakDensity), and gap filling. It then performs blank subtraction (ratio‑based filtering) and, for the retained features, extracts the best matching MS/MS spectra from the raw data in parallel. The final outputs are a feature quantification table (CSV) and the corresponding MS/MS spectra (MGF), ready for structural annotation with tools like DeepMass.
Before running, modify the file paths at the beginning of the script:
Disclaimer: "The code provided below is only an example and for reference only. Specific parameters or certain steps can be modified according to your specific requirements. This code is not guaranteed to be suitable for all mass spectrometry processing workflows."
# ============================================================
# Non-targeted LC-MS/MS data processing with XCMS
# Outputs: feature table (CSV) + MS/MS spectra (MGF) for DeepMass
# ============================================================
# Load required packages
suppressMessages({
library(xcms)
library(MSnbase)
library(foreach)
library(doParallel)
})
# ==================== USER PARAMETERS (MODIFY THESE) ====================
# --- File paths ---------------------------------------------------------
data_dir <- "path/to/your/mzML/files" # directory containing .mzML files
output_csv <- "path/to/output/feature_table.csv"
output_mgf <- "path/to/output/spectra.mgf"
# --- Blank sample identification -----------------------------------------
blank_pattern <- "Blank" # keyword in blank file names (case-insensitive)
# blank_indices <- c(1,2) # alternative: specify blank column indices
# --- MS2 matching parameters --------------------------------------------
ms2_ppm <- 20 # MS2 precursor mass tolerance (ppm)
ms2_rt_window <- 5 # retention time window (seconds)
# --- Blank subtraction threshold -----------------------------------------
blank_ratio_threshold <- 3 # sample/blank intensity ratio threshold
# --- Parallel computing -------------------------------------------------
n_cores <- max(1, parallel::detectCores() - 1) # number of CPU cores to use
# --- XCMS parameters (adjust as needed) ---------------------------------
ppm <- 10
peakwidth <- c(5, 20)
snthresh <- 10
prefilter <- c(3, 100)
binSize <- 0.25
center <- 1
bw <- 5
minFraction <- 0.5
minSamples <- 2
# ==================== READ DATA ====================
raw_files <- list.files(data_dir, pattern = "\\.mzML$", full.names = TRUE)
if (length(raw_files) == 0) stop("No .mzML files found in data_dir")
raw_data <- readMSData(raw_files, msLevel. = 1:2, mode = "onDisk")
# ==================== XCMS PROCESSING ====================
ms1_data <- filterMsLevel(raw_data, 1)
cwp <- CentWaveParam(ppm = ppm, peakwidth = peakwidth, snthresh = snthresh,
prefilter = prefilter, mzCenterFun = "wMean", integrate = 1)
xdata <- findChromPeaks(ms1_data, param = cwp)
# RT alignment (serial to avoid parallel errors)
library(BiocParallel)
register(SerialParam())
xdata <- adjustRtime(xdata, param = ObiwarpParam(binSize = binSize, center = center))
# Grouping
pdp <- PeakDensityParam(sampleGroups = rep(1, length(raw_files)),
bw = bw, minFraction = minFraction, minSamples = minSamples)
xdata <- groupChromPeaks(xdata, param = pdp)
# Gap filling
xdata <- fillChromPeaks(xdata)
# Feature matrix
feature_matrix <- featureValues(xdata, value = "into")
feature_def <- featureDefinitions(xdata)
# ==================== BLANK SUBTRACTION ====================
blank_idx <- grep(blank_pattern, basename(raw_files), ignore.case = TRUE)
if (length(blank_idx) == 0) {
warning("No blank samples found – skipping blank subtraction")
keep_final <- rep(TRUE, nrow(feature_matrix))
intensity_corrected <- NULL
} else {
sample_idx <- setdiff(seq_len(ncol(feature_matrix)), blank_idx)
blank_means <- rowMeans(feature_matrix[, blank_idx, drop = FALSE], na.rm = TRUE)
intensity_corrected <- feature_matrix[, sample_idx, drop = FALSE]
intensity_corrected <- sweep(intensity_corrected, 1, blank_means, FUN = "-")
intensity_corrected[intensity_corrected < 0] <- 0
max_int <- apply(intensity_corrected, 1, max, na.rm = TRUE)
ratio <- ifelse(blank_means == 0 & max_int == 0, 0, max_int / (blank_means + 1e-9))
keep_ratio <- !is.na(ratio) & ratio > blank_ratio_threshold
has_signal <- rowSums(intensity_corrected > 0, na.rm = TRUE) >= 1
keep_final <- keep_ratio & has_signal
cat("Features after blank subtraction:", sum(keep_final), "\n")
}
selected_ids <- rownames(feature_matrix)[keep_final]
if (length(selected_ids) == 0) stop("No features passed blank filtering")
# ==================== MS2 SPECTRA EXTRACTION (PARALLEL) ====================
ms2_data <- filterMsLevel(raw_data, 2)
ms2_spectra <- spectra(ms2_data)
if (length(ms2_spectra) == 0) stop("No MS2 spectra found")
ms2_rt <- sapply(ms2_spectra, function(sp) rtime(sp)[1])
ms2_premz <- sapply(ms2_spectra, function(sp) precursorMz(sp)[1])
match_ms2 <- function(mz, rt, ms2_rt, ms2_premz, ms2_sp, ppm, rt_win) {
mz_err <- (ppm / 1e6) * mz
rt_ok <- ms2_rt >= (rt - rt_win) & ms2_rt <= (rt + rt_win)
if (!any(rt_ok)) return(NULL)
premz_ok <- !is.na(ms2_premz) & ms2_premz >= (mz - mz_err) & ms2_premz <= (mz + mz_err)
idx <- which(rt_ok & premz_ok)
if (length(idx) == 0) return(NULL)
if (length(idx) > 1) idx <- idx[which.min(abs(ms2_rt[idx] - rt))]
sp <- ms2_sp[[idx]]
keep <- intensity(sp) > 0
if (sum(keep) == 0) return(NULL)
list(mz = mz(sp)[keep], int = intensity(sp)[keep])
}
# Subset features that passed blank filtering
feat_sub <- feature_def[selected_ids, , drop = FALSE]
feat_list <- split(feat_sub, seq_len(nrow(feat_sub)))
cl <- makeCluster(n_cores)
registerDoParallel(cl)
results <- foreach(feat = feat_list, .packages = "MSnbase") %dopar% {
feat_id <- rownames(feat)
match <- match_ms2(feat$mzmed, feat$rtmed, ms2_rt, ms2_premz, ms2_spectra,
ppm = ms2_ppm, rt_win = ms2_rt_window)
if (is.null(match)) return(NULL)
peak_str <- paste(paste(match$mz, match$int, sep = " "), collapse = "\n")
block <- c("BEGIN IONS",
paste0("TITLE=", feat_id),
sprintf("PEPMASS=%f", feat$mzmed),
sprintf("RTINSECONDS=%f", feat$rtmed),
"CHARGE=1+",
peak_str,
"END IONS")
list(feature_id = feat_id, mgf = block)
}
stopCluster(cl)
# Collect successful matches
success_ids <- c()
mgf_lines <- c()
for (r in results) {
if (!is.null(r)) {
success_ids <- c(success_ids, r$feature_id)
mgf_lines <- c(mgf_lines, r$mgf)
}
}
if (length(success_ids) == 0) stop("No MS2 spectra matched")
writeLines(mgf_lines, output_mgf)
cat("MGF written:", output_mgf, "\n")
# ==================== FINAL CSV ====================
if (exists("intensity_corrected") && !is.null(intensity_corrected)) {
final_mat <- intensity_corrected[success_ids, , drop = FALSE]
final_tab <- data.frame(
feature_id = rownames(final_mat),
mz = feature_def[success_ids, "mzmed"],
rt_med = feature_def[success_ids, "rtmed"],
final_mat,
stringsAsFactors = FALSE,
check.names = FALSE
)
} else {
raw_quant <- feature_matrix[success_ids, , drop = FALSE]
final_tab <- cbind(feature_id = rownames(raw_quant),
mz = feature_def[success_ids, "mzmed"],
rt_med = feature_def[success_ids, "rtmed"],
as.data.frame(raw_quant))
}
write.csv(final_tab, file = output_csv, row.names = FALSE)
cat("CSV written:", output_csv, "\n")
cat(paste("Final CSV rows:", nrow(final_tab), " | MGF entries:", length(success_ids), "\n"))
if (nrow(final_tab) == length(success_ids)) cat("✓ Row count matches MGF entries.\n")