Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
e1e144e
created function for generating replication pattern from sample sheet
mraves2 Feb 9, 2026
f8fbc49
renamed MakeInit step to ParseSamplesheet
mraves2 Feb 9, 2026
c4d4cf1
added unit tests for parse_samplesheet_functions
mraves2 Feb 9, 2026
99860c5
changed process name from MakeInit to ParseSamplesheet
mraves2 Feb 9, 2026
6dd934e
corrected file name for replication_pattern.txt
mraves2 Feb 9, 2026
093a4e9
removed outdir parameter, separated trim and breaks section
mraves2 Feb 9, 2026
1c98907
moved code into functions for GenerateBreaks
mraves2 Feb 9, 2026
76e4a56
added trim parameter to function get_trim_parameters and fixed typo
mraves2 Feb 10, 2026
aa127cd
added unit tests for GenerateBreaks
mraves2 Feb 10, 2026
9c2d41c
added fixtures files for test_generate_breaks
mraves2 Feb 10, 2026
56b9261
fixed type in comment line
mraves2 Mar 30, 2026
1b079b0
applied code review suggestions
mraves2 Mar 30, 2026
d3e2ecd
removed shell options and added emit stt in DIMS/ParseSamplesheet.nf
mraves2 Mar 31, 2026
5b770fd
Merge branch 'develop' into feature/refactor_DIMS_MakeInit
mraves2 Apr 2, 2026
ef3de4e
clarified trim parameter based on comments code review in DIMS genera…
mraves2 Apr 2, 2026
21e7ee4
Merge branch 'develop' into feature/refactor_DIMS_GenerateBreaks
mraves2 Apr 2, 2026
e406a0d
Merge pull request #113 from UMCUGenetics/hotfix/v2.7.1
mraves2 Apr 7, 2026
7f70a0e
code review modifications to DIMS/ParseSamplesheet.nf; emit and scrip…
mraves2 Apr 9, 2026
1695074
modified file name in DIMS/tests/testthat/test_generate_breaks.R
mraves2 Apr 9, 2026
9d6db81
Merge pull request #94 from UMCUGenetics/feature/refactor_DIMS_MakeInit
mraves2 Apr 10, 2026
ba318c9
Merge pull request #95 from UMCUGenetics/feature/refactor_DIMS_Genera…
mraves2 Apr 10, 2026
cd7e5ab
fixed indexing bug in DIMS/preprocessing/peak_finding_functions.R
mraves2 Apr 24, 2026
7421a25
decreased variation in random values in DIMS/preprocessing/fill_missi…
mraves2 Apr 28, 2026
8fffc7d
fixed double occurrance of ppmdev column in DIMS/preprocessing/collec…
mraves2 Apr 28, 2026
e31c4c2
modified unit test for DIMS/CollectFilled for fix in double ppmdev co…
mraves2 Apr 28, 2026
b492c79
fixed unit test in DIMS:test_peak_finding_functions.R
mraves2 Apr 30, 2026
99a57e9
fixed indentation in DIMS/preprocessing/peak_finding_functions.R
mraves2 Apr 30, 2026
6cce7ef
added comments in DIMS/preprocessing/peak_finding_functions.R
mraves2 Apr 30, 2026
69d26a3
Merge pull request #114 from UMCUGenetics/feature/DIMS_fix_bugs_valid…
mraves2 Apr 30, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 8 additions & 45 deletions DIMS/GenerateBreaks.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,56 +5,19 @@ suppressPackageStartupMessages(library("xcms"))
cmd_args <- commandArgs(trailingOnly = TRUE)

filepath <- cmd_args[1]
outdir <- cmd_args[2]
trim <- as.numeric(cmd_args[3])
resol <- as.numeric(cmd_args[4])

# initialize
trim_left_pos <- NULL
trim_right_pos <- NULL
trim_left_neg <- NULL
trim_right_neg <- NULL
breaks_fwhm <- NULL
breaks_fwhm_avg <- NULL
bins <- NULL
trim <- as.numeric(cmd_args[2])
resol <- as.numeric(cmd_args[3])

# read in mzML file
raw_data <- suppressMessages(xcms::xcmsRaw(filepath))

# Get time values for positive and negative scans
pos_times <- raw_data@scantime[raw_data@polarity == "positive"]
neg_times <- raw_data@scantime[raw_data@polarity == "negative"]

# trim (remove) scans at the start and end for positive
trim_left_pos <- round(pos_times[length(pos_times) * (trim * 1.5)]) # 15% aan het begin
trim_right_pos <- round(pos_times[length(pos_times) * (1 - (trim * 0.5))]) # 5% aan het eind
# get trim parameters and save them to file
get_trim_parameters(raw_data@scantime, raw_data@polarity, trim)

# trim (remove) scans at the start and end for negative
trim_left_neg <- round(neg_times[length(neg_times) * trim])
trim_right_neg <- round(neg_times[length(neg_times) * (1 - trim)])
# create breaks of bins for intensities. Bin size is a function of fwhm which is a function of m/z
# mzrange is accessed as a slot (`@`), consistent with the scantime and polarity slots used above
get_breaks_for_bins(raw_data@mzrange, resol)

# Mass range m/z
low_mz <- raw_data@mzrange[1]
# Determine maximum m/z and save to file
high_mz <- raw_data@mzrange[2]

# determine number of segments (bins)
nr_segments <- 2 * (high_mz - low_mz)
segment <- seq(from = low_mz, to = high_mz, length.out = nr_segments + 1)

# determine start and end of each bin.
for (i in 1:nr_segments) {
start_segment <- segment[i]
end_segment <- segment[i+1]
resol_mz <- resol * (1 / sqrt(2) ^ (log2(start_segment / 200)))
fwhm_segment <- start_segment / resol_mz
breaks_fwhm <- c(breaks_fwhm, seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment))
# average the m/z instead of start value
range <- seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment)
delta_mz <- range[2] - range[1]
breaks_fwhm_avg <- c(breaks_fwhm_avg, range + 0.5 * delta_mz)
}

# generate output file
save(breaks_fwhm, breaks_fwhm_avg, file = "breaks.fwhm.RData")
save(trim_left_pos, trim_right_pos, trim_left_neg, trim_right_neg, file = "trim_params.RData")
save(high_mz, file = "highest_mz.RData")

3 changes: 1 addition & 2 deletions DIMS/GenerateBreaks.nf
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,13 @@ process GenerateBreaks {
input:
tuple(val(file_id), path(mzML_file))


output:
path('breaks.fwhm.RData'), emit: breaks
path('trim_params.RData'), emit: trim_params
path('highest_mz.RData'), emit: highest_mz

script:
"""
Rscript ${baseDir}/CustomModules/DIMS/GenerateBreaks.R $mzML_file ./ $params.trim $params.resolution
Rscript ${baseDir}/CustomModules/DIMS/GenerateBreaks.R $mzML_file $params.trim $params.resolution
"""
}
29 changes: 0 additions & 29 deletions DIMS/MakeInit.R

This file was deleted.

18 changes: 0 additions & 18 deletions DIMS/MakeInit.nf

This file was deleted.

19 changes: 19 additions & 0 deletions DIMS/ParseSamplesheet.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# define parameters
# args[1]: path to the tab-separated sample sheet
# args[2]: directory containing the preprocessing function scripts
args <- commandArgs(trailingOnly = TRUE)

# check.names = FALSE keeps headers such as "File Name" unchanged; with the default,
# read.csv() would rewrite the space to a dot and the column lookup in
# generate_repl_pattern() could no longer match that spelling
sample_sheet <- read.csv(args[1], sep = "\t", check.names = FALSE)
preprocessing_scripts_dir <- args[2]

# load in function script
# file.path() inserts the separator, so the directory may be given with or without a trailing slash
source(file.path(preprocessing_scripts_dir, "parse_samplesheet_functions.R"))

# generate the replication pattern
repl_pattern <- generate_repl_pattern(sample_sheet)

# write the printed replication pattern to text file for troubleshooting purposes
writeLines(capture.output(print(repl_pattern)), "replication_pattern.txt")

# save replication pattern to file
save(repl_pattern, file = "init.RData")
18 changes: 18 additions & 0 deletions DIMS/ParseSamplesheet.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// Run DIMS/ParseSamplesheet.R on a sample sheet and emit the two files it writes.
process ParseSamplesheet {
tag "DIMS ParseSamplesheet"
label 'ParseSamplesheet'
container = 'docker://umcugenbioinf/dims:1.3'

input:
// sample sheet file; passed as the first argument to the R script
path(samplesheet)
// passed as the second argument to the R script (directory holding the preprocessing function scripts)
val(preprocessing_scripts_dir)

output:
// RData file with the replication pattern, written by the R script
path('init.RData'), emit: rdata_file
// printed replication pattern as plain text, written by the R script
path('replication_pattern.txt'), emit: repl_pattern_txtfile

script:
"""
Rscript ${baseDir}/CustomModules/DIMS/ParseSamplesheet.R $samplesheet $preprocessing_scripts_dir
"""
}
2 changes: 1 addition & 1 deletion DIMS/preprocessing/collect_filled_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ order_columns_peakgrouplist <- function(peakgroup_list) {

original_colnames <- colnames(peakgroup_list)
mass_columns <- c(grep("mzm", original_colnames), grep("nrsamples", original_colnames))
descriptive_columns <- c(grep("assi_HMDB", original_colnames):grep("avg.int", original_colnames), grep("ppmdev", original_colnames))
descriptive_columns <- grep("assi_HMDB", original_colnames):grep("avg.int", original_colnames)
intensity_columns <- c((grep("nrsamples", original_colnames) + 1):(grep("assi_HMDB", original_colnames) - 1))
# if no Z-scores have been calculated, the following two variables will be empty without consequences for outlist_total
control_columns <- grep ("ctrls", original_colnames)
Expand Down
2 changes: 1 addition & 1 deletion DIMS/preprocessing/fill_missing_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ fill_missing_intensities <- function(peakgroup_list, repl_pattern, thresh, disab
for (zero_index in seq_along(zero_intensity)) {
peakgroup_list[zero_intensity[zero_index], names(repl_pattern)[sample_index]] <- rnorm(n = 1,
mean = thresh,
sd = 100)
sd = 80)
}
}

Expand Down
59 changes: 59 additions & 0 deletions DIMS/preprocessing/generate_breaks_functions.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# GenerateBreaks functions
get_trim_parameters <- function(scantimes, polarities, trim = 0.1) {
  #' Determine per scan mode which scans are trimmed off at the start and end,
  #' and save the resulting trim parameters to file "trim_params.RData"
  #'
  #' @param scantimes: vector of scan times in seconds
  #' @param polarities: vector of polarities ("positive" or "negative"), parallel to scantimes
  #' @param trim: value for fraction of scans which are to be discarded (float)

  # rounded scan time found at the given fraction of the scans of one mode;
  # a fractional position is truncated by R's vector indexing
  scantime_at_fraction <- function(times, fraction) {
    round(times[length(times) * fraction])
  }

  # split the scan times by scan mode
  times_positive <- scantimes[polarities == "positive"]
  times_negative <- scantimes[polarities == "negative"]

  # positive mode: 1.5 * trim is removed at the start, 0.5 * trim at the end
  trim_left_pos <- scantime_at_fraction(times_positive, trim * 1.5)
  trim_right_pos <- scantime_at_fraction(times_positive, 1 - (trim * 0.5))

  # negative mode: trim is removed at both the start and the end
  trim_left_neg <- scantime_at_fraction(times_negative, trim)
  trim_right_neg <- scantime_at_fraction(times_negative, 1 - trim)

  # the object names below are stored in the RData file, so they must not be renamed
  save(trim_left_pos, trim_right_pos, trim_left_neg, trim_right_neg, file = "trim_params.RData")
}

get_breaks_for_bins <- function(mzrange, resol = 140000) {
  #' Create vectors with the breaks in m/z of bins for intensities and save them
  #' to file "breaks.fwhm.RData" (objects breaks_fwhm and breaks_fwhm_avg)
  #'
  #' @param mzrange: vector of minimum and maximum m/z values (numeric, length 2)
  #' @param resol: value for resolution (numeric)

  # determine number of segments used to create bins (two segments per m/z unit)
  nr_segments <- 2 * (mzrange[2] - mzrange[1])
  segments <- seq(from = mzrange[1], to = mzrange[2], length.out = nr_segments + 1)
  # NOTE(review): for a fractional nr_segments, seq() rounds length.out up while the
  # loop below stops at the truncated value, so the last segment looks to be left
  # without breaks - confirm whether that is intended.

  # guard against a zero-width m/z range, where 1:nr_segments would count down to 0;
  # for every positive nr_segments this is the same index vector as 1:nr_segments
  segment_indices <- if (nr_segments > 0) 1:nr_segments else integer(0)

  # collect the breaks per segment in preallocated lists instead of growing vectors
  breaks_per_segment <- vector("list", length(segment_indices))
  avg_breaks_per_segment <- vector("list", length(segment_indices))

  # determine start and end of each bin. fwhm (width of peaks) is assumed to be constant within a segment
  for (segment_index in segment_indices) {
    start_segment <- segments[segment_index]
    end_segment <- segments[segment_index + 1]
    # determine resolution at given m/z value
    resol_mz <- resol * (1 / sqrt(2) ^ (log2(start_segment / 200)))
    # determine fwhm (full width at half maximum) of the peaks in this segment
    fwhm_segment <- start_segment / resol_mz
    # determine the breaks within this segment (computed once, used for both outputs)
    breaks_segment <- seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment)
    breaks_per_segment[[segment_index]] <- breaks_segment
    # get a vector of average m/z instead of start value: shift each break by half a bin width
    delta_mz <- breaks_segment[2] - breaks_segment[1]
    avg_breaks_per_segment[[segment_index]] <- breaks_segment + 0.5 * delta_mz
  }

  # concatenate the segments; unlist() of an empty list gives NULL, as when no breaks were added
  breaks_fwhm <- unlist(breaks_per_segment)
  breaks_fwhm_avg <- unlist(avg_breaks_per_segment)

  # save breaks to file
  save(breaks_fwhm, breaks_fwhm_avg, file = "breaks.fwhm.RData")
}
30 changes: 30 additions & 0 deletions DIMS/preprocessing/parse_samplesheet_functions.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# function for parse_samplesheet

#' Generate replication pattern list based on information in sample_sheet
#'
#' @param sample_sheet: data frame with a file name column and a sample name column
#'
#' @return repl_pattern: list named by sample name; each element holds the file names
#'   (technical replicates) which belong to that sample
generate_repl_pattern <- function(sample_sheet) {
  # get the file name and sample name columns from the samplesheet
  file_name_col <- grep("File_Name|File Name", colnames(sample_sheet), ignore.case = TRUE)
  sample_name_col <- grep("Sample_Name|Sample Name", colnames(sample_sheet), ignore.case = TRUE)

  # each lookup must give exactly one column: none would fail further on with an
  # unclear error, and several would make `[[` index recursively into the data frame
  if (length(file_name_col) != 1 || length(sample_name_col) != 1) {
    stop(
      "sample sheet must contain exactly one file name column and exactly one sample name column",
      call. = FALSE
    )
  }

  # create replication pattern (which technical replicates belong to which sample)
  # NOTE(review): sample names are used as-is (no trimming or replacement of special
  # characters) - confirm whether downstream steps need sanitized names.
  repl_pattern <- split(
    sample_sheet[[file_name_col]],
    sample_sheet[[sample_name_col]]
  )

  return(repl_pattern)
}

22 changes: 7 additions & 15 deletions DIMS/preprocessing/peak_finding_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,26 +28,18 @@ search_regions_of_interest <- function(ints_fullrange) {
if (regions_of_interest_gte3[roi_nr, "length"] > 11) {
roi <- ints_fullrange[(regions_of_interest_gte3[roi_nr, "from"]:regions_of_interest_gte3[roi_nr, "to"]), ]
roi_intrange <- as.numeric(roi$int)
roi_firstindex <- as.numeric(rownames(roi)[1])
# look for local minima that separate the peaks
local_min_positions <- which(diff(sign(diff(roi_intrange))) == 2) + 1
if (length(local_min_positions) > 0) {
remove_roi_index <- c(remove_roi_index, roi_nr)
# find new indices for rois after splitting
start_pos <- regions_of_interest_gte3[roi_nr, "from"]
new_rois <- data.frame(from = 0, to = 0, length = 0)
new_rois_splitroi <- regions_of_interest_gte3[0, ]
for (local_min_index in 1:length(local_min_positions)) {
new_rois[, 1] <- start_pos
new_rois[, 2] <- start_pos + local_min_positions[local_min_index]
new_rois[, 3] <- new_rois[, 2] - new_rois[, 1] + 1
new_rois_splitroi <- rbind(new_rois_splitroi, new_rois)
start_pos <- new_rois[, 2]
}
# intensities after last local minimum
new_rois[, 1] <- start_pos
new_rois[, 2] <- regions_of_interest_gte3[roi_nr, "to"]
new_rois[, 3] <- new_rois[, 2] - new_rois[, 1] + 1
new_rois_splitroi <- rbind(new_rois_splitroi, new_rois)
new_rois_splitroi <- as.data.frame(matrix(0, ncol = 3, nrow = (length(local_min_positions) + 1)))
colnames(new_rois_splitroi) <- colnames(regions_of_interest_gte3)
# fill new rois matrix; from in column 1, to in column 2 and length in column 3
new_rois_splitroi[, 1] <- c(roi_firstindex, roi_firstindex + local_min_positions)
new_rois_splitroi[, 2] <- c(roi_firstindex + local_min_positions, roi_firstindex + length(roi_intrange))
new_rois_splitroi[, 3] <- new_rois_splitroi[, 2] - new_rois_splitroi[, 1]
# append
new_rois_all <- rbind(new_rois_all, new_rois_splitroi)
} else {
Expand Down
Binary file not shown.
10 changes: 5 additions & 5 deletions DIMS/tests/testthat/fixtures/test_peakgroup_list.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"mzmed.pgrp" "nrsamples" "C101.1" "C102.1" "P2.1" "P3.1" "assi_HMDB" "all_hmdb_names" "iso_HMDB" "HMDB_code" "all_hmdb_ids" "sec_hmdb_ids" "theormz_HMDB" "avg.int" "avg.ctrls" "sd.ctrls" "C101.1_Zscore" "C102.1_Zscore" "P2.1_Zscore" "P3.1_Zscore" "ppmdev"
"1" 300.199680958642 0.451108327135444 1000 5000 10000 50000 "A" "A;X" NA "HMDB1234567" "HMDB1234567;HMDB1234567" NA 300.1996476 16500 3000 2828.42712474619 9000 13000 90000 130000 0.111112214857712
"2" 300.000315890415 0.498603057814762 2000 6000 20000 60000 "B" "B;Y" NA "HMDB1234567_1" "HMDB1234567_1;HMDB1234567_1" NA 300.00017417 22000 4000 2828.42712474619 10000 14000 1e+05 140000 0.473299680976197
"3" 300.254185894039 0.589562055887654 3000 7000 30000 70000 "C" "C;Z" NA "HMDB1234567_2" "HMDB1234567_2;HMDB1234567_2" NA 300.25413357 27500 5000 2828.42712474619 11000 15000 110000 150000 0.17426158930175
"4" 300.755745105678 0.277923040557653 4000 8000 40000 80000 "D" "D;V" NA "HMDB1234567_7" "HMDB1234567_7;HMDB1234567_7" NA 300.75568892 33000 6000 2828.42712474619 12000 16000 120000 160000 0.186787674436346
"mzmed.pgrp" "nrsamples" "C101.1" "C102.1" "P2.1" "P3.1" "assi_HMDB" "all_hmdb_names" "iso_HMDB" "HMDB_code" "all_hmdb_ids" "sec_hmdb_ids" "theormz_HMDB" "ppmdev" "avg.int" "avg.ctrls" "sd.ctrls" "C101.1_Zscore" "C102.1_Zscore" "P2.1_Zscore" "P3.1_Zscore"
"1" 300.199680958642 0.451108327135444 1000 5000 10000 50000 "A" "A;X" NA "HMDB1234567" "HMDB1234567;HMDB1234567" NA 300.1996476 0.111112214857712 16500 3000 2828.42712474619 9000 13000 90000 130000
"2" 300.000315890415 0.498603057814762 2000 6000 20000 60000 "B" "B;Y" NA "HMDB1234567_1" "HMDB1234567_1;HMDB1234567_1" NA 300.00017417 0.473299680976197 22000 4000 2828.42712474619 10000 14000 1e+05 140000
"3" 300.254185894039 0.589562055887654 3000 7000 30000 70000 "C" "C;Z" NA "HMDB1234567_2" "HMDB1234567_2;HMDB1234567_2" NA 300.25413357 0.17426158930175 27500 5000 2828.42712474619 11000 15000 110000 150000
"4" 300.755745105678 0.277923040557653 4000 8000 40000 80000 "D" "D;V" NA "HMDB1234567_7" "HMDB1234567_7;HMDB1234567_7" NA 300.75568892 0.186787674436346 33000 6000 2828.42712474619 12000 16000 120000 160000
Binary file added DIMS/tests/testthat/fixtures/test_trim_params.RData
Binary file not shown.
24 changes: 24 additions & 0 deletions DIMS/tests/testthat/parse_samplesheet_functions.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# unit tests for ParseSamplesheet
# function: generate_repl_pattern

# source all functions for ParseSamplesheet
source("../../preprocessing/parse_samplesheet_functions.R")

# test generate_repl_pattern
testthat::test_that("replication pattern is correctly generated", {
  # create sample sheet to test on: 6 files, 2 technical replicates for each of 3 samples
  test_file_names <- paste0("RES_20260101_", sprintf("%03d", 1:6))
  test_sample_names <- sort(rep(c("C1", "P2", "P3"), 2))
  test_sample_sheet <- data.frame(File_Name = test_file_names, Sample_Name = test_sample_names)

  repl_pattern <- generate_repl_pattern(test_sample_sheet)
  # test that a list of length 3 is generated
  testthat::expect_length(repl_pattern, 3)
  # test list names (no extra positional argument: only object and expected are compared)
  testthat::expect_equal(names(repl_pattern), unique(test_sample_names))

  # test what happens if a sample name is used for more than one pair of files
  test_sample_names <- gsub("P3", "P2", test_sample_names)
  test_sample_sheet <- data.frame(File_Name = test_file_names, Sample_Name = test_sample_names)
  repl_pattern_merged <- generate_repl_pattern(test_sample_sheet)
  testthat::expect_length(repl_pattern_merged, 2)
  testthat::expect_length(repl_pattern_merged$P2, 4)
})
2 changes: 1 addition & 1 deletion DIMS/tests/testthat/test_collect_filled.R
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ testthat::test_that("columns in peak group list are corretly sorted", {
# original order of columns
original_column_order <- colnames(test_peakgroup_list)
# after ordering, column names should be re-ordered
test_column_order <- original_column_order[c(1, 2, 7:14, 21, 3:6, 15:20)]
test_column_order <- original_column_order[c(1, 2, 7:15, 3:6, 16:21)]

expect_identical(colnames(order_columns_peakgrouplist(test_peakgroup_list)), test_column_order)

Expand Down
Loading
Loading