UMCUGenetics · mraves2 · Apr 30, 2026 · Feb 9, 2026 · Feb 9, 2026 · Feb 9, 2026
diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R
@@ -5,56 +5,19 @@ suppressPackageStartupMessages(library("xcms"))
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
 filepath <- cmd_args[1]
-outdir <- cmd_args[2]
-trim <- as.numeric(cmd_args[3])
-resol <- as.numeric(cmd_args[4])
-
-# initialize
-trim_left_pos <- NULL
-trim_right_pos <- NULL
-trim_left_neg <- NULL
-trim_right_neg <- NULL
-breaks_fwhm <- NULL
-breaks_fwhm_avg <- NULL
-bins <- NULL
+trim <- as.numeric(cmd_args[2])
+resol <- as.numeric(cmd_args[3])
 
 # read in mzML file
 raw_data <- suppressMessages(xcms::xcmsRaw(filepath))
 
-# Get time values for positive and negative scans
-pos_times <- raw_data@scantime[raw_data@polarity == "positive"]
-neg_times <- raw_data@scantime[raw_data@polarity == "negative"]
-
-# trim (remove) scans at the start and end for positive
-trim_left_pos  <- round(pos_times[length(pos_times) * (trim * 1.5)]) # 15% aan het begin
-trim_right_pos <- round(pos_times[length(pos_times) * (1 - (trim * 0.5))]) # 5% aan het eind
+# get trim parameters and save them to file
+get_trim_parameters(raw_data@scantime, raw_data@polarity, trim)
 
-# trim (remove) scans at the start and end for negative
-trim_left_neg  <- round(neg_times[length(neg_times) * trim])
-trim_right_neg <- round(neg_times[length(neg_times) * (1 - trim)])
+# create breaks of bins for intensities. Bin size is a function of fwhm which is a function of m/z; mzrange is an S4 slot
+get_breaks_for_bins(raw_data@mzrange, resol)
 
-# Mass range m/z
-low_mz  <- raw_data@mzrange[1]
+# Determine maximum m/z and save to file
 high_mz <- raw_data@mzrange[2]
-
-# determine number of segments (bins)
-nr_segments <- 2 * (high_mz - low_mz)
-segment <- seq(from = low_mz, to = high_mz, length.out = nr_segments + 1)
-
-# determine start and end of each bin.
-for (i in 1:nr_segments) {
-  start_segment <- segment[i]
-  end_segment <- segment[i+1]
-  resol_mz <- resol * (1 / sqrt(2) ^ (log2(start_segment / 200)))
-  fwhm_segment <- start_segment / resol_mz
-  breaks_fwhm <- c(breaks_fwhm, seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment))
-  # average the m/z instead of start value
-  range <- seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment)
-  delta_mz <- range[2] - range[1]
-  breaks_fwhm_avg <- c(breaks_fwhm_avg, range + 0.5 * delta_mz)
-}
-
-# generate output file
-save(breaks_fwhm, breaks_fwhm_avg, file = "breaks.fwhm.RData")
-save(trim_left_pos, trim_right_pos, trim_left_neg, trim_right_neg, file = "trim_params.RData")
 save(high_mz, file = "highest_mz.RData")
+
diff --git a/DIMS/GenerateBreaks.nf b/DIMS/GenerateBreaks.nf
@@ -7,14 +7,13 @@ process GenerateBreaks {
     input:
        tuple(val(file_id), path(mzML_file))
 
-
     output:
        path('breaks.fwhm.RData'), emit: breaks
        path('trim_params.RData'), emit: trim_params
        path('highest_mz.RData'), emit: highest_mz
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/GenerateBreaks.R $mzML_file ./ $params.trim $params.resolution 
+        Rscript ${baseDir}/CustomModules/DIMS/GenerateBreaks.R $mzML_file $params.trim $params.resolution 
         """
 }
diff --git a/DIMS/MakeInit.R b/DIMS/MakeInit.R
diff --git a/DIMS/MakeInit.nf b/DIMS/MakeInit.nf
diff --git a/DIMS/ParseSamplesheet.R b/DIMS/ParseSamplesheet.R
@@ -0,0 +1,19 @@
+# parse the sample sheet into a replication pattern (technical replicate files per sample)
+# arguments: 1) tab-separated sample sheet, 2) directory with the preprocessing function scripts
+args <- commandArgs(trailingOnly = TRUE)
+
+# check.names = FALSE keeps headers such as "File Name" as written instead of "File.Name"
+sample_sheet <- read.csv(args[1], sep = "\t", check.names = FALSE)
+preprocessing_scripts_dir <- args[2]
+
+# load in function script; file.path works with or without a trailing slash on the directory
+source(file.path(preprocessing_scripts_dir, "parse_samplesheet_functions.R"))
+
+# generate the replication pattern
+repl_pattern <- generate_repl_pattern(sample_sheet)
+
+# write the replication pattern to text file for troubleshooting purposes
+capture.output(print(repl_pattern), file = "replication_pattern.txt")
+
+# save replication pattern to file
+save(repl_pattern, file = "init.RData")
diff --git a/DIMS/ParseSamplesheet.nf b/DIMS/ParseSamplesheet.nf
@@ -0,0 +1,18 @@
+process ParseSamplesheet { // runs ParseSamplesheet.R to derive the replication pattern from the sample sheet
+    tag "DIMS ParseSamplesheet"
+    label 'ParseSamplesheet'
+    container = 'docker://umcugenbioinf/dims:1.3'
+
+    input:
+       path(samplesheet) // sample sheet, passed as first argument to ParseSamplesheet.R
+       val(preprocessing_scripts_dir) // second argument; the R script sources parse_samplesheet_functions.R from it
+
+    output:
+       path('init.RData'), emit: rdata_file // replication pattern saved by the R script
+       path('replication_pattern.txt'), emit: repl_pattern_txtfile // printed replication pattern, for troubleshooting
+
+    script:
+        """
+        Rscript ${baseDir}/CustomModules/DIMS/ParseSamplesheet.R $samplesheet $preprocessing_scripts_dir
+        """
+}
diff --git a/DIMS/preprocessing/collect_filled_functions.R b/DIMS/preprocessing/collect_filled_functions.R
@@ -129,7 +129,7 @@ order_columns_peakgrouplist <- function(peakgroup_list) {
 
   original_colnames <- colnames(peakgroup_list)
   mass_columns <- c(grep("mzm", original_colnames), grep("nrsamples", original_colnames))
-  descriptive_columns <- c(grep("assi_HMDB", original_colnames):grep("avg.int", original_colnames), grep("ppmdev", original_colnames))
+  descriptive_columns <- grep("assi_HMDB", original_colnames):grep("avg.int", original_colnames)
   intensity_columns <- c((grep("nrsamples", original_colnames) + 1):(grep("assi_HMDB", original_colnames) - 1))
   # if no Z-scores have been calculated, the following two variables will be empty without consequences for outlist_total
   control_columns <- grep ("ctrls", original_colnames)

diff --git a/DIMS/preprocessing/fill_missing_functions.R b/DIMS/preprocessing/fill_missing_functions.R
@@ -24,7 +24,7 @@ fill_missing_intensities <- function(peakgroup_list, repl_pattern, thresh, disab
       for (zero_index in seq_along(zero_intensity)) {
         peakgroup_list[zero_intensity[zero_index], names(repl_pattern)[sample_index]] <- rnorm(n = 1,
                                                                                                mean = thresh,
-                                                                                               sd = 100)
+                                                                                               sd = 80)
       }
     }
 

diff --git a/DIMS/preprocessing/generate_breaks_functions.R b/DIMS/preprocessing/generate_breaks_functions.R
@@ -0,0 +1,59 @@
+# GenerateBreaks functions
+get_trim_parameters <- function(scantimes, polarities, trim = 0.1) {
+  #' determine per scanmode which scans are trimmed off; save trim parameters to file
+  #'
+  #' @param scantimes: vector of scan times in seconds
+  #' @param polarities: vector of polarities (positive or negative), used to select scan times
+  #' @param trim: value for fraction of scans which are to be discarded (float)
+
+  # split the scan times by polarity
+  times_positive <- scantimes[polarities == "positive"]
+  times_negative <- scantimes[polarities == "negative"]
+  nr_positive <- length(times_positive)
+  nr_negative <- length(times_negative)
+  # positive mode is trimmed asymmetrically: 1.5 * trim at the start, 0.5 * trim at the end
+  trim_left_pos  <- round(times_positive[nr_positive * (trim * 1.5)])
+  trim_right_pos <- round(times_positive[nr_positive * (1 - (trim * 0.5))])
+  # negative mode is trimmed symmetrically: trim at the start and at the end
+  trim_left_neg  <- round(times_negative[nr_negative * trim])
+  trim_right_neg <- round(times_negative[nr_negative * (1 - trim)])
+
+  # save the rounded scan times that mark the trim boundaries to file
+  save(trim_left_pos, trim_right_pos, trim_left_neg, trim_right_neg, file = "trim_params.RData")
+}
+
+get_breaks_for_bins <- function(mzrange, resol = 140000) {
+  #' create a vector with the breaks in m/z of bins for intensities; save breaks to file
+  #'
+  #' Writes breaks_fwhm (bin edges) and breaks_fwhm_avg (edges shifted by half a
+  #' bin width) to "breaks.fwhm.RData" in the working directory.
+  #'
+  #' @param mzrange: vector of minimum and maximum m/z values (numeric)
+  #' @param resol: value for resolution; the formula below applies it unchanged at m/z 200 (integer)
+
+  stopifnot(is.numeric(mzrange), mzrange[2] > mzrange[1])
+
+  # determine number of segments used to create bins (two segments per m/z unit)
+  nr_segments <- 2 * (mzrange[2] - mzrange[1])
+  segments <- seq(from = mzrange[1], to = mzrange[2], length.out = nr_segments + 1)
+
+  # breaks per segment; fwhm (width of peaks) is assumed to be constant within a segment.
+  # NOTE(review): for a fractional nr_segments the last segment is not visited,
+  # exactly as in the former 1:nr_segments loop -- confirm this is intended.
+  breaks_per_segment <- lapply(seq_len(max(1, floor(nr_segments))), function(segment_index) {
+    start_segment <- segments[segment_index]
+    # resolution at this m/z: resol scaled by 1 / sqrt(m/z / 200)
+    resol_mz <- resol * (1 / sqrt(2) ^ (log2(start_segment / 200)))
+    # fwhm (full width at half maximum) of the peaks in this segment
+    fwhm_segment <- start_segment / resol_mz
+    seq(from = (start_segment + fwhm_segment), to = segments[segment_index + 1], by = 0.2 * fwhm_segment)
+  })
+  breaks_fwhm <- unlist(breaks_per_segment)
+  # average m/z instead of start value: shift each break by half the bin width of its segment
+  breaks_fwhm_avg <- unlist(lapply(breaks_per_segment, function(breaks_segment) {
+    breaks_segment + 0.5 * (breaks_segment[2] - breaks_segment[1])
+  }))
+
+  # save breaks to file
+  save(breaks_fwhm, breaks_fwhm_avg, file = "breaks.fwhm.RData")
+}
diff --git a/DIMS/preprocessing/parse_samplesheet_functions.R b/DIMS/preprocessing/parse_samplesheet_functions.R
@@ -0,0 +1,30 @@
+# function for parse_samplesheet
+
+#' Generate replication pattern list based on information in sample_sheet
+#'
+#' Sample names are trimmed and every character that is not a letter, number,
+#' hyphen or period is replaced by an underscore before grouping.
+#'
+#' @param sample_sheet: data frame with one column of file names and one column of sample names
+#'
+#' @return repl_pattern: named list; per sample name the file names of its technical replicates
+generate_repl_pattern <- function(sample_sheet) {
+  # get the file name and sample name columns from the samplesheet; the separator may be
+  # an underscore, a space or the period that read.csv substitutes for a space
+  file_name_col <- grep("File[_ .]Name", colnames(sample_sheet), ignore.case = TRUE)
+  sample_name_col <- grep("Sample[_ .]Name", colnames(sample_sheet), ignore.case = TRUE)
+  if (length(file_name_col) != 1 || length(sample_name_col) != 1) {
+    stop("sample sheet must contain exactly one file name column and one sample name column",
+         call. = FALSE)
+  }
+
+  # clean up the sample names: strip surrounding whitespace and replace all characters
+  # which are not letters, numbers, hyphens and periods
+  sample_names <- trimws(as.character(sample_sheet[[sample_name_col]]))
+  sample_names <- gsub("[^-.[:alnum:]]", "_", sample_names)
+
+  # create replication pattern (which technical replicates belong to which sample);
+  # split() orders the list by sample name
+  split(sample_sheet[[file_name_col]], sample_names)
+}
+
diff --git a/DIMS/preprocessing/peak_finding_functions.R b/DIMS/preprocessing/peak_finding_functions.R
@@ -28,26 +28,18 @@ search_regions_of_interest <- function(ints_fullrange) {
     if (regions_of_interest_gte3[roi_nr, "length"] > 11) {
       roi <- ints_fullrange[(regions_of_interest_gte3[roi_nr, "from"]:regions_of_interest_gte3[roi_nr, "to"]), ]
       roi_intrange <- as.numeric(roi$int)
+      roi_firstindex <- as.numeric(rownames(roi)[1])
       # look for local minima that separate the peaks
       local_min_positions <- which(diff(sign(diff(roi_intrange))) == 2) + 1
       if (length(local_min_positions) > 0) {
         remove_roi_index <- c(remove_roi_index, roi_nr)
         # find new indices for rois after splitting
-        start_pos <- regions_of_interest_gte3[roi_nr, "from"]
-        new_rois <- data.frame(from = 0, to = 0, length = 0)
-        new_rois_splitroi <- regions_of_interest_gte3[0, ]
-        for (local_min_index in 1:length(local_min_positions)) {
-          new_rois[, 1] <- start_pos
-          new_rois[, 2] <- start_pos + local_min_positions[local_min_index]
-          new_rois[, 3] <- new_rois[, 2] - new_rois[, 1] + 1
-          new_rois_splitroi <- rbind(new_rois_splitroi, new_rois)
-          start_pos <- new_rois[, 2]
-        }
-        # intensities after last local minimum
-        new_rois[, 1] <- start_pos
-        new_rois[, 2] <- regions_of_interest_gte3[roi_nr, "to"]
-        new_rois[, 3] <- new_rois[, 2] - new_rois[, 1] + 1
-        new_rois_splitroi <- rbind(new_rois_splitroi, new_rois)
+        new_rois_splitroi <- as.data.frame(matrix(0, ncol = 3, nrow = (length(local_min_positions) + 1)))
+        colnames(new_rois_splitroi) <- colnames(regions_of_interest_gte3)
+	# fill new rois matrix; from in column 1, to in column 2 and length in column 3
+        new_rois_splitroi[, 1] <- c(roi_firstindex, roi_firstindex + local_min_positions)
+        new_rois_splitroi[, 2] <- c(roi_firstindex + local_min_positions, roi_firstindex + length(roi_intrange))
+        new_rois_splitroi[, 3] <- new_rois_splitroi[, 2] - new_rois_splitroi[, 1]
         # append
         new_rois_all <- rbind(new_rois_all, new_rois_splitroi)
       } else {

diff --git a/DIMS/tests/testthat/fixtures/test_breaks.fwhm.RData b/DIMS/tests/testthat/fixtures/test_breaks.fwhm.RData
diff --git a/DIMS/tests/testthat/fixtures/test_peakgroup_list.txt b/DIMS/tests/testthat/fixtures/test_peakgroup_list.txt
@@ -1,5 +1,5 @@
-"mzmed.pgrp"	"nrsamples"	"C101.1"	"C102.1"	"P2.1"	"P3.1"	"assi_HMDB"	"all_hmdb_names"	"iso_HMDB"	"HMDB_code"	"all_hmdb_ids"	"sec_hmdb_ids"	"theormz_HMDB"	"avg.int"	"avg.ctrls"	"sd.ctrls"	"C101.1_Zscore"	"C102.1_Zscore"	"P2.1_Zscore"	"P3.1_Zscore"	"ppmdev"
-"1"	300.199680958642	0.451108327135444	1000	5000	10000	50000	"A"	"A;X"	NA	"HMDB1234567"	"HMDB1234567;HMDB1234567"	NA	300.1996476	16500	3000	2828.42712474619	9000	13000	90000	130000	0.111112214857712
-"2"	300.000315890415	0.498603057814762	2000	6000	20000	60000	"B"	"B;Y"	NA	"HMDB1234567_1"	"HMDB1234567_1;HMDB1234567_1"	NA	300.00017417	22000	4000	2828.42712474619	10000	14000	1e+05	140000	0.473299680976197
-"3"	300.254185894039	0.589562055887654	3000	7000	30000	70000	"C"	"C;Z"	NA	"HMDB1234567_2"	"HMDB1234567_2;HMDB1234567_2"	NA	300.25413357	27500	5000	2828.42712474619	11000	15000	110000	150000	0.17426158930175
-"4"	300.755745105678	0.277923040557653	4000	8000	40000	80000	"D"	"D;V"	NA	"HMDB1234567_7"	"HMDB1234567_7;HMDB1234567_7"	NA	300.75568892	33000	6000	2828.42712474619	12000	16000	120000	160000	0.186787674436346
+"mzmed.pgrp"	"nrsamples"	"C101.1"	"C102.1"	"P2.1"	"P3.1"	"assi_HMDB"	"all_hmdb_names"	"iso_HMDB"	"HMDB_code"	"all_hmdb_ids"	"sec_hmdb_ids"	"theormz_HMDB"	"ppmdev"	"avg.int"	"avg.ctrls"	"sd.ctrls"	"C101.1_Zscore"	"C102.1_Zscore"	"P2.1_Zscore"	"P3.1_Zscore"
+"1"	300.199680958642	0.451108327135444	1000	5000	10000	50000	"A"	"A;X"	NA	"HMDB1234567"	"HMDB1234567;HMDB1234567"	NA	300.1996476	0.111112214857712	16500	3000	2828.42712474619	9000	13000	90000	130000
+"2"	300.000315890415	0.498603057814762	2000	6000	20000	60000	"B"	"B;Y"	NA	"HMDB1234567_1"	"HMDB1234567_1;HMDB1234567_1"	NA	300.00017417	0.473299680976197	22000	4000	2828.42712474619	10000	14000	1e+05	140000
+"3"	300.254185894039	0.589562055887654	3000	7000	30000	70000	"C"	"C;Z"	NA	"HMDB1234567_2"	"HMDB1234567_2;HMDB1234567_2"	NA	300.25413357	0.17426158930175	27500	5000	2828.42712474619	11000	15000	110000	150000
+"4"	300.755745105678	0.277923040557653	4000	8000	40000	80000	"D"	"D;V"	NA	"HMDB1234567_7"	"HMDB1234567_7;HMDB1234567_7"	NA	300.75568892	0.186787674436346	33000	6000	2828.42712474619	12000	16000	120000	160000
diff --git a/DIMS/tests/testthat/fixtures/test_trim_params.RData b/DIMS/tests/testthat/fixtures/test_trim_params.RData
diff --git a/DIMS/tests/testthat/parse_samplesheet_functions.R b/DIMS/tests/testthat/parse_samplesheet_functions.R
@@ -0,0 +1,24 @@
+# unit tests for ParseSamplesheet
+# function: generate_repl_pattern
+# NOTE(review): testthat only runs files whose names start with "test"; consider renaming this file
+# source all functions for ParseSamplesheet
+source("../../preprocessing/parse_samplesheet_functions.R")
+
+testthat::test_that("replication pattern is correctly generated", {
+  # create sample sheet to test on:
+  test_file_names <- paste0("RES_20260101_", sprintf("%03d", 1:6))
+  test_sample_names <- rep(c("C1", "P2", "P3"), each = 2)
+  test_sample_sheet <- data.frame(File_Name = test_file_names, Sample_Name = test_sample_names)
+
+  # every sample gets its own list element, named after the sample
+  repl_pattern <- generate_repl_pattern(test_sample_sheet)
+  expect_length(repl_pattern, 3)
+  expect_equal(names(repl_pattern), unique(test_sample_names))
+  expect_equal(repl_pattern$P2, test_file_names[3:4])
+
+  # test what happens if any sample name is used twice
+  test_sample_sheet$Sample_Name <- gsub("P3", "P2", test_sample_names)
+  repl_pattern <- generate_repl_pattern(test_sample_sheet)
+  expect_length(repl_pattern, 2)
+  expect_length(repl_pattern$P2, 4)
+})
diff --git a/DIMS/tests/testthat/test_collect_filled.R b/DIMS/tests/testthat/test_collect_filled.R
@@ -65,7 +65,7 @@ testthat::test_that("columns in peak group list are corretly sorted", {
   # original order of columns
   original_column_order <- colnames(test_peakgroup_list)
   # after ordering, column names should be re-ordered
-  test_column_order <- original_column_order[c(1, 2, 7:14, 21, 3:6, 15:20)]
+  test_column_order <- original_column_order[c(1, 2, 7:15, 3:6, 16:21)]
 
   expect_identical(colnames(order_columns_peakgrouplist(test_peakgroup_list)), test_column_order)