XCMS Workflow for DeepMass

Code:


            # ============================================================
            # Non-targeted LC-MS/MS data processing with XCMS
            # Outputs: feature table (CSV) + MS/MS spectra (MGF) for DeepMass
            # ============================================================

            # Load required packages
            suppressMessages({
            library(xcms)
            library(MSnbase)
            library(foreach)
            library(doParallel)
            })

            # ==================== USER PARAMETERS (MODIFY THESE) ====================
            # --- File paths ---------------------------------------------------------
            data_dir <- "path/to/your/mzML/files"          # directory containing .mzML files
            output_csv <- "path/to/output/feature_table.csv"
            output_mgf <- "path/to/output/spectra.mgf"

            # --- Blank sample identification -----------------------------------------
            blank_pattern <- "Blank"                       # keyword in blank file names (case-insensitive)
            # blank_indices <- c(1,2)                     # alternative: specify blank column indices

            # --- MS2 matching parameters --------------------------------------------
            ms2_ppm <- 20                                  # MS2 precursor mass tolerance (ppm)
            ms2_rt_window <- 5                             # retention time window (seconds)

            # --- Blank subtraction threshold -----------------------------------------
            blank_ratio_threshold <- 3                     # sample/blank intensity ratio threshold

            # --- Parallel computing -------------------------------------------------
            n_cores <- max(1, parallel::detectCores() - 1) # number of CPU cores to use

            # --- XCMS parameters (adjust as needed) ---------------------------------
            ppm <- 10
            peakwidth <- c(5, 20)
            snthresh <- 10
            prefilter <- c(3, 100)
            binSize <- 0.25
            center <- 1
            bw <- 5
            minFraction <- 0.5
            minSamples <- 2

            # ==================== READ DATA ====================
            raw_files <- list.files(data_dir, pattern = "\\.mzML$", full.names = TRUE)
            if (length(raw_files) == 0) stop("No .mzML files found in data_dir")
            raw_data <- readMSData(raw_files, msLevel. = 1:2, mode = "onDisk")

            # ==================== XCMS PROCESSING ====================
            ms1_data <- filterMsLevel(raw_data, 1)
            cwp <- CentWaveParam(ppm = ppm, peakwidth = peakwidth, snthresh = snthresh,
                              prefilter = prefilter, mzCenterFun = "wMean", integrate = 1)
            xdata <- findChromPeaks(ms1_data, param = cwp)

            # RT alignment (serial to avoid parallel errors)
            library(BiocParallel)
            register(SerialParam())
            xdata <- adjustRtime(xdata, param = ObiwarpParam(binSize = binSize, center = center))

            # Grouping
            pdp <- PeakDensityParam(sampleGroups = rep(1, length(raw_files)),
                                    bw = bw, minFraction = minFraction, minSamples = minSamples)
            xdata <- groupChromPeaks(xdata, param = pdp)

            # Gap filling
            xdata <- fillChromPeaks(xdata)

            # Feature matrix
            feature_matrix <- featureValues(xdata, value = "into")
            feature_def <- featureDefinitions(xdata)

            # ==================== BLANK SUBTRACTION ====================
            blank_idx <- grep(blank_pattern, basename(raw_files), ignore.case = TRUE)
            if (length(blank_idx) == 0) {
            warning("No blank samples found – skipping blank subtraction")
            keep_final <- rep(TRUE, nrow(feature_matrix))
            intensity_corrected <- NULL
            } else {
            sample_idx <- setdiff(seq_len(ncol(feature_matrix)), blank_idx)
            blank_means <- rowMeans(feature_matrix[, blank_idx, drop = FALSE], na.rm = TRUE)
            intensity_corrected <- feature_matrix[, sample_idx, drop = FALSE]
            intensity_corrected <- sweep(intensity_corrected, 1, blank_means, FUN = "-")
            intensity_corrected[intensity_corrected < 0] <- 0

            max_int <- apply(intensity_corrected, 1, max, na.rm = TRUE)
            ratio <- ifelse(blank_means == 0 & max_int == 0, 0, max_int / (blank_means + 1e-9))
            keep_ratio <- !is.na(ratio) & ratio > blank_ratio_threshold
            has_signal <- rowSums(intensity_corrected > 0, na.rm = TRUE) >= 1
            keep_final <- keep_ratio & has_signal
            cat("Features after blank subtraction:", sum(keep_final), "\n")
            }

            selected_ids <- rownames(feature_matrix)[keep_final]
            if (length(selected_ids) == 0) stop("No features passed blank filtering")

            # ==================== MS2 SPECTRA EXTRACTION (PARALLEL) ====================
            ms2_data <- filterMsLevel(raw_data, 2)
            ms2_spectra <- spectra(ms2_data)
            if (length(ms2_spectra) == 0) stop("No MS2 spectra found")
            ms2_rt <- sapply(ms2_spectra, function(sp) rtime(sp)[1])
            ms2_premz <- sapply(ms2_spectra, function(sp) precursorMz(sp)[1])

            match_ms2 <- function(mz, rt, ms2_rt, ms2_premz, ms2_sp, ppm, rt_win) {
            mz_err <- (ppm / 1e6) * mz
            rt_ok <- ms2_rt >= (rt - rt_win) & ms2_rt <= (rt + rt_win)
            if (!any(rt_ok)) return(NULL)
            premz_ok <- !is.na(ms2_premz) & ms2_premz >= (mz - mz_err) & ms2_premz <= (mz + mz_err)
            idx <- which(rt_ok & premz_ok)
            if (length(idx) == 0) return(NULL)
            if (length(idx) > 1) idx <- idx[which.min(abs(ms2_rt[idx] - rt))]
            sp <- ms2_sp[[idx]]
            keep <- intensity(sp) > 0
            if (sum(keep) == 0) return(NULL)
            list(mz = mz(sp)[keep], int = intensity(sp)[keep])
            }

            # Subset features that passed blank filtering
            feat_sub <- feature_def[selected_ids, , drop = FALSE]
            feat_list <- split(feat_sub, seq_len(nrow(feat_sub)))

            cl <- makeCluster(n_cores)
            registerDoParallel(cl)
            results <- foreach(feat = feat_list, .packages = "MSnbase") %dopar% {
            feat_id <- rownames(feat)
            match <- match_ms2(feat$mzmed, feat$rtmed, ms2_rt, ms2_premz, ms2_spectra,
                              ppm = ms2_ppm, rt_win = ms2_rt_window)
            if (is.null(match)) return(NULL)
            peak_str <- paste(paste(match$mz, match$int, sep = " "), collapse = "\n")
            block <- c("BEGIN IONS",
                        paste0("TITLE=", feat_id),
                        sprintf("PEPMASS=%f", feat$mzmed),
                        sprintf("RTINSECONDS=%f", feat$rtmed),
                        "CHARGE=1+",
                        peak_str,
                        "END IONS")
            list(feature_id = feat_id, mgf = block)
            }
            stopCluster(cl)

            # Collect successful matches
            success_ids <- c()
            mgf_lines <- c()
            for (r in results) {
            if (!is.null(r)) {
            success_ids <- c(success_ids, r$feature_id)
            mgf_lines <- c(mgf_lines, r$mgf)
            }
            }
            if (length(success_ids) == 0) stop("No MS2 spectra matched")

            writeLines(mgf_lines, output_mgf)
            cat("MGF written:", output_mgf, "\n")

            # ==================== FINAL CSV ====================
            if (exists("intensity_corrected") && !is.null(intensity_corrected)) {
            final_mat <- intensity_corrected[success_ids, , drop = FALSE]
            final_tab <- data.frame(
            feature_id = rownames(final_mat),
            mz = feature_def[success_ids, "mzmed"],
            rt_med = feature_def[success_ids, "rtmed"],
            final_mat,
            stringsAsFactors = FALSE,
            check.names = FALSE
            )
            } else {
            raw_quant <- feature_matrix[success_ids, , drop = FALSE]
            final_tab <- cbind(feature_id = rownames(raw_quant),
                              mz = feature_def[success_ids, "mzmed"],
                              rt_med = feature_def[success_ids, "rtmed"],
                              as.data.frame(raw_quant))
            }
            write.csv(final_tab, file = output_csv, row.names = FALSE)
            cat("CSV written:", output_csv, "\n")
            cat(paste("Final CSV rows:", nrow(final_tab), " | MGF entries:", length(success_ids), "\n"))
            if (nrow(final_tab) == length(success_ids)) cat("✓ Row count matches MGF entries.\n")
Mass spectrometry processing with XCMS

Mass spectrometry processing with XCMS

Instructions

Code:

Notes