UMCUGenetics · mraves2 · Apr 30, 2026 · Apr 30, 2026
diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R
@@ -5,19 +5,56 @@ suppressPackageStartupMessages(library("xcms"))
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
 filepath <- cmd_args[1]
-trim <- as.numeric(cmd_args[2])
-resol <- as.numeric(cmd_args[3])
+outdir <- cmd_args[2]
+trim <- as.numeric(cmd_args[3])
+resol <- as.numeric(cmd_args[4])
+
+# initialize
+trim_left_pos <- NULL
+trim_right_pos <- NULL
+trim_left_neg <- NULL
+trim_right_neg <- NULL
+breaks_fwhm <- NULL
+breaks_fwhm_avg <- NULL
+bins <- NULL
 
 # read in mzML file
 raw_data <- suppressMessages(xcms::xcmsRaw(filepath))
 
-# get trim parameters and save them to file
-get_trim_parameters(raw_data@scantime, raw_data@polarity, trim)
+# Get time values for positive and negative scans
+pos_times <- raw_data@scantime[raw_data@polarity == "positive"]
+neg_times <- raw_data@scantime[raw_data@polarity == "negative"]
+
+# trim (remove) scans at the start and end for positive
+trim_left_pos  <- round(pos_times[length(pos_times) * (trim * 1.5)]) # 15% aan het begin
+trim_right_pos <- round(pos_times[length(pos_times) * (1 - (trim * 0.5))]) # 5% aan het eind
 
-# create breaks of bins for intensities. Bin size is a function of fwhm which is a function of m/z
-get_breaks_for_bins(raw_data$mzrange, resol)
+# trim (remove) scans at the start and end for negative
+trim_left_neg  <- round(neg_times[length(neg_times) * trim])
+trim_right_neg <- round(neg_times[length(neg_times) * (1 - trim)])
 
-# Determine maximum m/z and save to file
+# Mass range m/z
+low_mz  <- raw_data@mzrange[1]
 high_mz <- raw_data@mzrange[2]
-save(high_mz, file = "highest_mz.RData")
 
+# determine number of segments (bins)
+nr_segments <- 2 * (high_mz - low_mz)
+segment <- seq(from = low_mz, to = high_mz, length.out = nr_segments + 1)
+
+# determine start and end of each bin.
+for (i in 1:nr_segments) {
+  start_segment <- segment[i]
+  end_segment <- segment[i+1]
+  resol_mz <- resol * (1 / sqrt(2) ^ (log2(start_segment / 200)))
+  fwhm_segment <- start_segment / resol_mz
+  breaks_fwhm <- c(breaks_fwhm, seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment))
+  # average the m/z instead of start value
+  range <- seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment)
+  delta_mz <- range[2] - range[1]
+  breaks_fwhm_avg <- c(breaks_fwhm_avg, range + 0.5 * delta_mz)
+}
+
+# generate output file
+save(breaks_fwhm, breaks_fwhm_avg, file = "breaks.fwhm.RData")
+save(trim_left_pos, trim_right_pos, trim_left_neg, trim_right_neg, file = "trim_params.RData")
+save(high_mz, file = "highest_mz.RData")
diff --git a/DIMS/GenerateBreaks.nf b/DIMS/GenerateBreaks.nf
@@ -7,13 +7,14 @@ process GenerateBreaks {
     input:
        tuple(val(file_id), path(mzML_file))
 
+
     output:
        path('breaks.fwhm.RData'), emit: breaks
        path('trim_params.RData'), emit: trim_params
        path('highest_mz.RData'), emit: highest_mz
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/GenerateBreaks.R $mzML_file $params.trim $params.resolution 
+        Rscript ${baseDir}/CustomModules/DIMS/GenerateBreaks.R $mzML_file ./ $params.trim $params.resolution 
         """
 }
diff --git a/DIMS/MakeInit.R b/DIMS/MakeInit.R
@@ -0,0 +1,29 @@
+# define parameters
+args <- commandArgs(trailingOnly = TRUE)
+
+sample_sheet <- read.csv(args[1], sep = "\t")
+nr_replicates <- as.numeric(args[2])
+
+sample_names <- trimws(as.vector(unlist(sample_sheet[1])))
+nr_sample_groups <- length(sample_names) / nr_replicates
+group_names <- trimws(as.vector(unlist(sample_sheet[2])))
+group_names <- gsub("[^-.[:alnum:]]", "_", group_names)
+group_names_unique <- unique(group_names)
+
+# generate the replication pattern
+repl_pattern <- c()
+for (sample_group in 1:nr_sample_groups) {
+  replicates_persample <- c()
+  for (repl in nr_replicates:1) {
+    index <- ((sample_group * nr_replicates) - repl) + 1
+    replicates_persample <- c(replicates_persample, sample_names[index])
+  }
+  repl_pattern <- c(repl_pattern, list(replicates_persample))
+}
+
+names(repl_pattern) <- group_names_unique
+
+# preview the replication pattern
+print(tail(repl_pattern))
+
+save(repl_pattern, file = "init.RData")
diff --git a/DIMS/MakeInit.nf b/DIMS/MakeInit.nf
@@ -0,0 +1,18 @@
+process MakeInit {
+    tag "DIMS MakeInit"
+    label 'MakeInit'
+    container = 'docker://umcugenbioinf/dims:1.3'
+    shell = ['/bin/bash', '-euo', 'pipefail']
+
+    input:
+       path(samplesheet) 
+       val(nr_replicates)
+
+    output:
+       path('init.RData')
+
+    script:
+        """
+        Rscript ${baseDir}/CustomModules/DIMS/MakeInit.R $samplesheet $nr_replicates 
+        """
+}
diff --git a/DIMS/ParseSamplesheet.R b/DIMS/ParseSamplesheet.R
diff --git a/DIMS/ParseSamplesheet.nf b/DIMS/ParseSamplesheet.nf
diff --git a/DIMS/preprocessing/collect_filled_functions.R b/DIMS/preprocessing/collect_filled_functions.R
@@ -129,7 +129,7 @@ order_columns_peakgrouplist <- function(peakgroup_list) {
 
   original_colnames <- colnames(peakgroup_list)
   mass_columns <- c(grep("mzm", original_colnames), grep("nrsamples", original_colnames))
-  descriptive_columns <- grep("assi_HMDB", original_colnames):grep("avg.int", original_colnames)
+  descriptive_columns <- c(grep("assi_HMDB", original_colnames):grep("avg.int", original_colnames), grep("ppmdev", original_colnames))
   intensity_columns <- c((grep("nrsamples", original_colnames) + 1):(grep("assi_HMDB", original_colnames) - 1))
   # if no Z-scores have been calculated, the following two variables will be empty without consequences for outlist_total
   control_columns <- grep ("ctrls", original_colnames)

diff --git a/DIMS/preprocessing/fill_missing_functions.R b/DIMS/preprocessing/fill_missing_functions.R
@@ -24,7 +24,7 @@ fill_missing_intensities <- function(peakgroup_list, repl_pattern, thresh, disab
       for (zero_index in seq_along(zero_intensity)) {
         peakgroup_list[zero_intensity[zero_index], names(repl_pattern)[sample_index]] <- rnorm(n = 1,
                                                                                                mean = thresh,
-                                                                                               sd = 80)
+                                                                                               sd = 100)
       }
     }
 

diff --git a/DIMS/preprocessing/generate_breaks_functions.R b/DIMS/preprocessing/generate_breaks_functions.R
diff --git a/DIMS/preprocessing/parse_samplesheet_functions.R b/DIMS/preprocessing/parse_samplesheet_functions.R
diff --git a/DIMS/preprocessing/peak_finding_functions.R b/DIMS/preprocessing/peak_finding_functions.R
@@ -28,18 +28,26 @@ search_regions_of_interest <- function(ints_fullrange) {
     if (regions_of_interest_gte3[roi_nr, "length"] > 11) {
       roi <- ints_fullrange[(regions_of_interest_gte3[roi_nr, "from"]:regions_of_interest_gte3[roi_nr, "to"]), ]
       roi_intrange <- as.numeric(roi$int)
-      roi_firstindex <- as.numeric(rownames(roi)[1])
       # look for local minima that separate the peaks
       local_min_positions <- which(diff(sign(diff(roi_intrange))) == 2) + 1
       if (length(local_min_positions) > 0) {
         remove_roi_index <- c(remove_roi_index, roi_nr)
         # find new indices for rois after splitting
-        new_rois_splitroi <- as.data.frame(matrix(0, ncol = 3, nrow = (length(local_min_positions) + 1)))
-        colnames(new_rois_splitroi) <- colnames(regions_of_interest_gte3)
-	# fill new rois matrix; from in column 1, to in column 2 and length in column 3
-        new_rois_splitroi[, 1] <- c(roi_firstindex, roi_firstindex + local_min_positions)
-        new_rois_splitroi[, 2] <- c(roi_firstindex + local_min_positions, roi_firstindex + length(roi_intrange))
-        new_rois_splitroi[, 3] <- new_rois_splitroi[, 2] - new_rois_splitroi[, 1]
+        start_pos <- regions_of_interest_gte3[roi_nr, "from"]
+        new_rois <- data.frame(from = 0, to = 0, length = 0)
+        new_rois_splitroi <- regions_of_interest_gte3[0, ]
+        for (local_min_index in 1:length(local_min_positions)) {
+          new_rois[, 1] <- start_pos
+          new_rois[, 2] <- start_pos + local_min_positions[local_min_index]
+          new_rois[, 3] <- new_rois[, 2] - new_rois[, 1] + 1
+          new_rois_splitroi <- rbind(new_rois_splitroi, new_rois)
+          start_pos <- new_rois[, 2]
+        }
+        # intensities after last local minimum
+        new_rois[, 1] <- start_pos
+        new_rois[, 2] <- regions_of_interest_gte3[roi_nr, "to"]
+        new_rois[, 3] <- new_rois[, 2] - new_rois[, 1] + 1
+        new_rois_splitroi <- rbind(new_rois_splitroi, new_rois)
         # append
         new_rois_all <- rbind(new_rois_all, new_rois_splitroi)
       } else {

diff --git a/DIMS/tests/testthat/fixtures/test_breaks.fwhm.RData b/DIMS/tests/testthat/fixtures/test_breaks.fwhm.RData
diff --git a/DIMS/tests/testthat/fixtures/test_peakgroup_list.txt b/DIMS/tests/testthat/fixtures/test_peakgroup_list.txt
@@ -1,5 +1,5 @@
-"mzmed.pgrp"	"nrsamples"	"C101.1"	"C102.1"	"P2.1"	"P3.1"	"assi_HMDB"	"all_hmdb_names"	"iso_HMDB"	"HMDB_code"	"all_hmdb_ids"	"sec_hmdb_ids"	"theormz_HMDB"	"ppmdev"	"avg.int"	"avg.ctrls"	"sd.ctrls"	"C101.1_Zscore"	"C102.1_Zscore"	"P2.1_Zscore"	"P3.1_Zscore"
-"1"	300.199680958642	0.451108327135444	1000	5000	10000	50000	"A"	"A;X"	NA	"HMDB1234567"	"HMDB1234567;HMDB1234567"	NA	300.1996476	0.111112214857712	16500	3000	2828.42712474619	9000	13000	90000	130000
-"2"	300.000315890415	0.498603057814762	2000	6000	20000	60000	"B"	"B;Y"	NA	"HMDB1234567_1"	"HMDB1234567_1;HMDB1234567_1"	NA	300.00017417	0.473299680976197	22000	4000	2828.42712474619	10000	14000	1e+05	140000
-"3"	300.254185894039	0.589562055887654	3000	7000	30000	70000	"C"	"C;Z"	NA	"HMDB1234567_2"	"HMDB1234567_2;HMDB1234567_2"	NA	300.25413357	0.17426158930175	27500	5000	2828.42712474619	11000	15000	110000	150000
-"4"	300.755745105678	0.277923040557653	4000	8000	40000	80000	"D"	"D;V"	NA	"HMDB1234567_7"	"HMDB1234567_7;HMDB1234567_7"	NA	300.75568892	0.186787674436346	33000	6000	2828.42712474619	12000	16000	120000	160000
+"mzmed.pgrp"	"nrsamples"	"C101.1"	"C102.1"	"P2.1"	"P3.1"	"assi_HMDB"	"all_hmdb_names"	"iso_HMDB"	"HMDB_code"	"all_hmdb_ids"	"sec_hmdb_ids"	"theormz_HMDB"	"avg.int"	"avg.ctrls"	"sd.ctrls"	"C101.1_Zscore"	"C102.1_Zscore"	"P2.1_Zscore"	"P3.1_Zscore"	"ppmdev"
+"1"	300.199680958642	0.451108327135444	1000	5000	10000	50000	"A"	"A;X"	NA	"HMDB1234567"	"HMDB1234567;HMDB1234567"	NA	300.1996476	16500	3000	2828.42712474619	9000	13000	90000	130000	0.111112214857712
+"2"	300.000315890415	0.498603057814762	2000	6000	20000	60000	"B"	"B;Y"	NA	"HMDB1234567_1"	"HMDB1234567_1;HMDB1234567_1"	NA	300.00017417	22000	4000	2828.42712474619	10000	14000	1e+05	140000	0.473299680976197
+"3"	300.254185894039	0.589562055887654	3000	7000	30000	70000	"C"	"C;Z"	NA	"HMDB1234567_2"	"HMDB1234567_2;HMDB1234567_2"	NA	300.25413357	27500	5000	2828.42712474619	11000	15000	110000	150000	0.17426158930175
+"4"	300.755745105678	0.277923040557653	4000	8000	40000	80000	"D"	"D;V"	NA	"HMDB1234567_7"	"HMDB1234567_7;HMDB1234567_7"	NA	300.75568892	33000	6000	2828.42712474619	12000	16000	120000	160000	0.186787674436346
diff --git a/DIMS/tests/testthat/fixtures/test_trim_params.RData b/DIMS/tests/testthat/fixtures/test_trim_params.RData
diff --git a/DIMS/tests/testthat/parse_samplesheet_functions.R b/DIMS/tests/testthat/parse_samplesheet_functions.R
diff --git a/DIMS/tests/testthat/test_collect_filled.R b/DIMS/tests/testthat/test_collect_filled.R
@@ -65,7 +65,7 @@ testthat::test_that("columns in peak group list are corretly sorted", {
   # original order of columns
   original_column_order <- colnames(test_peakgroup_list)
   # after ordering, column names should be re-ordered
-  test_column_order <- original_column_order[c(1, 2, 7:15, 3:6, 16:21)]
+  test_column_order <- original_column_order[c(1, 2, 7:14, 21, 3:6, 15:20)]
 
   expect_identical(colnames(order_columns_peakgrouplist(test_peakgroup_list)), test_column_order)