DM
DeepMASS2 Documentation
Introduction

Mass spectrometry processing with XCMS

This script processes untargeted LC-MS/MS data using XCMS with standard steps: peak picking (centWave), retention time alignment (Obiwarp), peak grouping (PeakDensity), and gap filling. It then performs blank subtraction (ratio‑based filtering) and, for the retained features, extracts the best matching MS/MS spectra from the raw data in parallel. The final outputs are a feature quantification table (CSV) and the corresponding MS/MS spectra (MGF), ready for structural annotation with tools like DeepMass.

Before running, modify the file paths at the beginning of the script:

Mass spectrometry processing with XCMS

    Instructions

  1. Modify the three file paths at the top of the script according to your system.
  2. Ensure your blank .mzML files contain the keyword "Blank" (case-insensitive) in their names, or use blank_indices to specify column indices manually.
  3. Adjust XCMS parameters if needed (e.g., ppm, peakwidth, snthresh) for your data quality.
  4. Run the script in R (≥ 4.0). It will automatically install missing packages.
  5. The output CSV and MGF files are ready for DeepMass.

Disclaimer: "The code provided below is only an example and for reference only. Specific parameters or certain steps can be modified according to your specific requirements. This code is not guaranteed to be suitable for all mass spectrometry processing workflows."

Code:


            # ============================================================
            # Non-targeted LC-MS/MS data processing with XCMS
            # Outputs: feature table (CSV) + MS/MS spectra (MGF) for DeepMass
            # ============================================================

            # Load required packages
            suppressMessages({
            library(xcms)
            library(MSnbase)
            library(foreach)
            library(doParallel)
            })

            # ==================== USER PARAMETERS (MODIFY THESE) ====================
            # --- File paths ---------------------------------------------------------
            data_dir <- "path/to/your/mzML/files"          # directory containing .mzML files
            output_csv <- "path/to/output/feature_table.csv"
            output_mgf <- "path/to/output/spectra.mgf"

            # --- Blank sample identification -----------------------------------------
            blank_pattern <- "Blank"                       # keyword in blank file names (case-insensitive)
            # blank_indices <- c(1,2)                     # alternative: specify blank column indices

            # --- MS2 matching parameters --------------------------------------------
            ms2_ppm <- 20                                  # MS2 precursor mass tolerance (ppm)
            ms2_rt_window <- 5                             # retention time window (seconds)

            # --- Blank subtraction threshold -----------------------------------------
            blank_ratio_threshold <- 3                     # sample/blank intensity ratio threshold

            # --- Parallel computing -------------------------------------------------
            n_cores <- max(1, parallel::detectCores() - 1) # number of CPU cores to use

            # --- XCMS parameters (adjust as needed) ---------------------------------
            ppm <- 10
            peakwidth <- c(5, 20)
            snthresh <- 10
            prefilter <- c(3, 100)
            binSize <- 0.25
            center <- 1
            bw <- 5
            minFraction <- 0.5
            minSamples <- 2

            # ==================== READ DATA ====================
            raw_files <- list.files(data_dir, pattern = "\\.mzML$", full.names = TRUE)
            if (length(raw_files) == 0) stop("No .mzML files found in data_dir")
            raw_data <- readMSData(raw_files, msLevel. = 1:2, mode = "onDisk")

            # ==================== XCMS PROCESSING ====================
            ms1_data <- filterMsLevel(raw_data, 1)
            cwp <- CentWaveParam(ppm = ppm, peakwidth = peakwidth, snthresh = snthresh,
                              prefilter = prefilter, mzCenterFun = "wMean", integrate = 1)
            xdata <- findChromPeaks(ms1_data, param = cwp)

            # RT alignment (serial to avoid parallel errors)
            library(BiocParallel)
            register(SerialParam())
            xdata <- adjustRtime(xdata, param = ObiwarpParam(binSize = binSize, center = center))

            # Grouping
            pdp <- PeakDensityParam(sampleGroups = rep(1, length(raw_files)),
                                    bw = bw, minFraction = minFraction, minSamples = minSamples)
            xdata <- groupChromPeaks(xdata, param = pdp)

            # Gap filling
            xdata <- fillChromPeaks(xdata)

            # Feature matrix
            feature_matrix <- featureValues(xdata, value = "into")
            feature_def <- featureDefinitions(xdata)

            # ==================== BLANK SUBTRACTION ====================
            blank_idx <- grep(blank_pattern, basename(raw_files), ignore.case = TRUE)
            if (length(blank_idx) == 0) {
            warning("No blank samples found – skipping blank subtraction")
            keep_final <- rep(TRUE, nrow(feature_matrix))
            intensity_corrected <- NULL
            } else {
            sample_idx <- setdiff(seq_len(ncol(feature_matrix)), blank_idx)
            blank_means <- rowMeans(feature_matrix[, blank_idx, drop = FALSE], na.rm = TRUE)
            intensity_corrected <- feature_matrix[, sample_idx, drop = FALSE]
            intensity_corrected <- sweep(intensity_corrected, 1, blank_means, FUN = "-")
            intensity_corrected[intensity_corrected < 0] <- 0

            max_int <- apply(intensity_corrected, 1, max, na.rm = TRUE)
            ratio <- ifelse(blank_means == 0 & max_int == 0, 0, max_int / (blank_means + 1e-9))
            keep_ratio <- !is.na(ratio) & ratio > blank_ratio_threshold
            has_signal <- rowSums(intensity_corrected > 0, na.rm = TRUE) >= 1
            keep_final <- keep_ratio & has_signal
            cat("Features after blank subtraction:", sum(keep_final), "\n")
            }

            selected_ids <- rownames(feature_matrix)[keep_final]
            if (length(selected_ids) == 0) stop("No features passed blank filtering")

            # ==================== MS2 SPECTRA EXTRACTION (PARALLEL) ====================
            ms2_data <- filterMsLevel(raw_data, 2)
            ms2_spectra <- spectra(ms2_data)
            if (length(ms2_spectra) == 0) stop("No MS2 spectra found")
            ms2_rt <- sapply(ms2_spectra, function(sp) rtime(sp)[1])
            ms2_premz <- sapply(ms2_spectra, function(sp) precursorMz(sp)[1])

            match_ms2 <- function(mz, rt, ms2_rt, ms2_premz, ms2_sp, ppm, rt_win) {
            mz_err <- (ppm / 1e6) * mz
            rt_ok <- ms2_rt >= (rt - rt_win) & ms2_rt <= (rt + rt_win)
            if (!any(rt_ok)) return(NULL)
            premz_ok <- !is.na(ms2_premz) & ms2_premz >= (mz - mz_err) & ms2_premz <= (mz + mz_err)
            idx <- which(rt_ok & premz_ok)
            if (length(idx) == 0) return(NULL)
            if (length(idx) > 1) idx <- idx[which.min(abs(ms2_rt[idx] - rt))]
            sp <- ms2_sp[[idx]]
            keep <- intensity(sp) > 0
            if (sum(keep) == 0) return(NULL)
            list(mz = mz(sp)[keep], int = intensity(sp)[keep])
            }

            # Subset features that passed blank filtering
            feat_sub <- feature_def[selected_ids, , drop = FALSE]
            feat_list <- split(feat_sub, seq_len(nrow(feat_sub)))

            cl <- makeCluster(n_cores)
            registerDoParallel(cl)
            results <- foreach(feat = feat_list, .packages = "MSnbase") %dopar% {
            feat_id <- rownames(feat)
            match <- match_ms2(feat$mzmed, feat$rtmed, ms2_rt, ms2_premz, ms2_spectra,
                              ppm = ms2_ppm, rt_win = ms2_rt_window)
            if (is.null(match)) return(NULL)
            peak_str <- paste(paste(match$mz, match$int, sep = " "), collapse = "\n")
            block <- c("BEGIN IONS",
                        paste0("TITLE=", feat_id),
                        sprintf("PEPMASS=%f", feat$mzmed),
                        sprintf("RTINSECONDS=%f", feat$rtmed),
                        "CHARGE=1+",
                        peak_str,
                        "END IONS")
            list(feature_id = feat_id, mgf = block)
            }
            stopCluster(cl)

            # Collect successful matches
            success_ids <- c()
            mgf_lines <- c()
            for (r in results) {
            if (!is.null(r)) {
            success_ids <- c(success_ids, r$feature_id)
            mgf_lines <- c(mgf_lines, r$mgf)
            }
            }
            if (length(success_ids) == 0) stop("No MS2 spectra matched")

            writeLines(mgf_lines, output_mgf)
            cat("MGF written:", output_mgf, "\n")

            # ==================== FINAL CSV ====================
            if (exists("intensity_corrected") && !is.null(intensity_corrected)) {
            final_mat <- intensity_corrected[success_ids, , drop = FALSE]
            final_tab <- data.frame(
            feature_id = rownames(final_mat),
            mz = feature_def[success_ids, "mzmed"],
            rt_med = feature_def[success_ids, "rtmed"],
            final_mat,
            stringsAsFactors = FALSE,
            check.names = FALSE
            )
            } else {
            raw_quant <- feature_matrix[success_ids, , drop = FALSE]
            final_tab <- cbind(feature_id = rownames(raw_quant),
                              mz = feature_def[success_ids, "mzmed"],
                              rt_med = feature_def[success_ids, "rtmed"],
                              as.data.frame(raw_quant))
            }
            write.csv(final_tab, file = output_csv, row.names = FALSE)
            cat("CSV written:", output_csv, "\n")
            cat(paste("Final CSV rows:", nrow(final_tab), " | MGF entries:", length(success_ids), "\n"))
            if (nrow(final_tab) == length(success_ids)) cat("✓ Row count matches MGF entries.\n")
      

Notes

  1. Parallel processing speeds up MS2 matching; reduce n_cores if memory is limited.
  2. The MS2 matching tolerance (ms2_ppm, ms2_rt_window) should reflect your instrument’s accuracy.
  3. If no blank samples are detected, the script skips subtraction and uses all features (still filtered by MS2 availability).
湘公网安备43010402002088号 湘ICP备2025102844号-1