diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R index e8beffb8..007d5ab7 100644 --- a/DIMS/GenerateBreaks.R +++ b/DIMS/GenerateBreaks.R @@ -5,19 +5,56 @@ suppressPackageStartupMessages(library("xcms")) cmd_args <- commandArgs(trailingOnly = TRUE) filepath <- cmd_args[1] -trim <- as.numeric(cmd_args[2]) -resol <- as.numeric(cmd_args[3]) +outdir <- cmd_args[2] +trim <- as.numeric(cmd_args[3]) +resol <- as.numeric(cmd_args[4]) + +# initialize +trim_left_pos <- NULL +trim_right_pos <- NULL +trim_left_neg <- NULL +trim_right_neg <- NULL +breaks_fwhm <- NULL +breaks_fwhm_avg <- NULL +bins <- NULL # read in mzML file raw_data <- suppressMessages(xcms::xcmsRaw(filepath)) -# get trim parameters and save them to file -get_trim_parameters(raw_data@scantime, raw_data@polarity, trim) +# Get time values for positive and negative scans +pos_times <- raw_data@scantime[raw_data@polarity == "positive"] +neg_times <- raw_data@scantime[raw_data@polarity == "negative"] + +# trim (remove) scans at the start and end for positive +trim_left_pos <- round(pos_times[length(pos_times) * (trim * 1.5)]) # 15% aan het begin +trim_right_pos <- round(pos_times[length(pos_times) * (1 - (trim * 0.5))]) # 5% aan het eind -# create breaks of bins for intensities. Bin size is a function of fwhm which is a function of m/z -get_breaks_for_bins(raw_data$mzrange, resol) +# trim (remove) scans at the start and end for negative +trim_left_neg <- round(neg_times[length(neg_times) * trim]) +trim_right_neg <- round(neg_times[length(neg_times) * (1 - trim)]) -# Determine maximum m/z and save to file +# Mass range m/z +low_mz <- raw_data@mzrange[1] high_mz <- raw_data@mzrange[2] -save(high_mz, file = "highest_mz.RData") +# determine number of segments (bins) +nr_segments <- 2 * (high_mz - low_mz) +segment <- seq(from = low_mz, to = high_mz, length.out = nr_segments + 1) + +# determine start and end of each bin. +for (i in 1:nr_segments) { + start_segment <- segment[i] + end_segment <- segment[i+1] + resol_mz <- resol * (1 / sqrt(2) ^ (log2(start_segment / 200))) + fwhm_segment <- start_segment / resol_mz + breaks_fwhm <- c(breaks_fwhm, seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment)) + # average the m/z instead of start value + range <- seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment) + delta_mz <- range[2] - range[1] + breaks_fwhm_avg <- c(breaks_fwhm_avg, range + 0.5 * delta_mz) +} + +# generate output file +save(breaks_fwhm, breaks_fwhm_avg, file = "breaks.fwhm.RData") +save(trim_left_pos, trim_right_pos, trim_left_neg, trim_right_neg, file = "trim_params.RData") +save(high_mz, file = "highest_mz.RData") diff --git a/DIMS/GenerateBreaks.nf b/DIMS/GenerateBreaks.nf index e2dc2d46..c486010d 100644 --- a/DIMS/GenerateBreaks.nf +++ b/DIMS/GenerateBreaks.nf @@ -7,6 +7,7 @@ process GenerateBreaks { input: tuple(val(file_id), path(mzML_file)) + output: path('breaks.fwhm.RData'), emit: breaks path('trim_params.RData'), emit: trim_params @@ -14,6 +15,6 @@ process GenerateBreaks { script: """ - Rscript ${baseDir}/CustomModules/DIMS/GenerateBreaks.R $mzML_file $params.trim $params.resolution + Rscript ${baseDir}/CustomModules/DIMS/GenerateBreaks.R $mzML_file ./ $params.trim $params.resolution """ } diff --git a/DIMS/MakeInit.R b/DIMS/MakeInit.R new file mode 100644 index 00000000..9ff22623 --- /dev/null +++ b/DIMS/MakeInit.R @@ -0,0 +1,29 @@ +# define parameters +args <- commandArgs(trailingOnly = TRUE) + +sample_sheet <- read.csv(args[1], sep = "\t") +nr_replicates <- as.numeric(args[2]) + +sample_names <- trimws(as.vector(unlist(sample_sheet[1]))) +nr_sample_groups <- length(sample_names) / nr_replicates +group_names <- trimws(as.vector(unlist(sample_sheet[2]))) +group_names <- gsub("[^-.[:alnum:]]", "_", group_names) +group_names_unique <- unique(group_names) + +# generate the replication pattern +repl_pattern <- c() +for (sample_group in 1:nr_sample_groups) { + replicates_persample <- c() + for (repl in nr_replicates:1) { + index <- ((sample_group * nr_replicates) - repl) + 1 + replicates_persample <- c(replicates_persample, sample_names[index]) + } + repl_pattern <- c(repl_pattern, list(replicates_persample)) +} + +names(repl_pattern) <- group_names_unique + +# preview the replication pattern +print(tail(repl_pattern)) + +save(repl_pattern, file = "init.RData") diff --git a/DIMS/MakeInit.nf b/DIMS/MakeInit.nf new file mode 100644 index 00000000..7aae0e46 --- /dev/null +++ b/DIMS/MakeInit.nf @@ -0,0 +1,18 @@ +process MakeInit { + tag "DIMS MakeInit" + label 'MakeInit' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(samplesheet) + val(nr_replicates) + + output: + path('init.RData') + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/MakeInit.R $samplesheet $nr_replicates + """ +} diff --git a/DIMS/ParseSamplesheet.R b/DIMS/ParseSamplesheet.R deleted file mode 100644 index 806486fb..00000000 --- a/DIMS/ParseSamplesheet.R +++ /dev/null @@ -1,19 +0,0 @@ -# define parameters -args <- commandArgs(trailingOnly = TRUE) - -sample_sheet <- as.data.frame(read.csv(args[1], sep = "\t")) -preprocessing_scripts_dir <- args[2] - -# load in function script -source(paste0(preprocessing_scripts_dir, "parse_samplesheet_functions.R")) - -# generate the replication pattern -repl_pattern <- generate_repl_pattern(sample_sheet) - -# write the replication pattern to text file for troubleshooting purposes -sink("replication_pattern.txt") -print(repl_pattern) -sink() - -# save replication pattern to file -save(repl_pattern, file = "init.RData") diff --git a/DIMS/ParseSamplesheet.nf b/DIMS/ParseSamplesheet.nf deleted file mode 100644 index e0fc055b..00000000 --- a/DIMS/ParseSamplesheet.nf +++ /dev/null @@ -1,18 +0,0 @@ -process ParseSamplesheet { - tag "DIMS ParseSamplesheet" - label 'ParseSamplesheet' - container = 'docker://umcugenbioinf/dims:1.3' - - input: - path(samplesheet) - val(preprocessing_scripts_dir) - - output: - path('init.RData'), emit: rdata_file - path('replication_pattern.txt'), emit: repl_pattern_txtfile - - script: - """ - Rscript ${baseDir}/CustomModules/DIMS/ParseSamplesheet.R $samplesheet $preprocessing_scripts_dir - """ -} diff --git a/DIMS/preprocessing/generate_breaks_functions.R b/DIMS/preprocessing/generate_breaks_functions.R deleted file mode 100644 index 9fc84125..00000000 --- a/DIMS/preprocessing/generate_breaks_functions.R +++ /dev/null @@ -1,59 +0,0 @@ -# GenerateBreaks functions -get_trim_parameters <- function(scantimes, polarities, trim = 0.1) { - #' determine the scans per scanmode which are trimmed off; save trim parameters to file - #' - #' @param scantimes: vector of scan times in seconds - #' @param polarities: vector of polarities (positive or negative) - #' @param trim: value for fraction of scans which are to be discarded (float) - - # Get time values for positive and negative scans - pos_times <- scantimes[polarities == "positive"] - neg_times <- scantimes[polarities == "negative"] - - # trim: remove scans at the start and end for positive - trim_left_pos <- round(pos_times[length(pos_times) * (trim * 1.5)]) - trim_right_pos <- round(pos_times[length(pos_times) * (1 - (trim * 0.5))]) - - # trim: remove scans at the start and end for negative - trim_left_neg <- round(neg_times[length(neg_times) * trim]) - trim_right_neg <- round(neg_times[length(neg_times) * (1 - trim)]) - - # save trim parameters to file - save(trim_left_pos, trim_right_pos, trim_left_neg, trim_right_neg, file = "trim_params.RData") -} - -get_breaks_for_bins <- function(mzrange, resol = 140000) { - #' create a vector with the breaks in m/z of bins for intensities - #' - #' @param mzrange: vector of minimum and maximum m/z values (integeers) - #' @param resol: value for resolution (integer) - - # initialize - breaks_fwhm <- NULL - breaks_fwhm_avg <- NULL - - # determine number of segments used to create bins - nr_segments <- 2 * (mzrange[2] - mzrange[1]) - segments <- seq(from = mzrange[1], to = mzrange[2], length.out = nr_segments + 1) - - # determine start and end of each bin. fwhm (width of peaks) is assumed to be constant within a segment - for (segment_index in 1:nr_segments) { - start_segment <- segments[segment_index] - end_segment <- segments[segment_index + 1] - # determine resolution at given m/z value - resol_mz <- resol * (1 / sqrt(2) ^ (log2(start_segment / 200))) - # determine fwhm (full width at half maximum) of the peaks in this segment - fwhm_segment <- start_segment / resol_mz - # determine the breaks within this segment - breaks_segment <- seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment) - # add breaks for this segment to vector with all breaks - breaks_fwhm <- c(breaks_fwhm, seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment)) - # get a vector of average m/z instead of start value - delta_mz <- breaks_segment[2] - breaks_segment[1] - avg_breaks_segment <- breaks_segment + 0.5 * delta_mz - breaks_fwhm_avg <- c(breaks_fwhm_avg, avg_breaks_segment) - } - - # save breaks to file - save(breaks_fwhm, breaks_fwhm_avg, file = "breaks.fwhm.RData") -} diff --git a/DIMS/preprocessing/parse_samplesheet_functions.R b/DIMS/preprocessing/parse_samplesheet_functions.R deleted file mode 100644 index 9107ba6a..00000000 --- a/DIMS/preprocessing/parse_samplesheet_functions.R +++ /dev/null @@ -1,30 +0,0 @@ -# function for parse_samplesheet - -#' Generate replication pattern list based on information in sample_sheet -#' -#' @param sample_sheet: matrix of file names and sample names -#' -#' @return ints_sorted: list of sample names with corresponding file names (technical replicates) -generate_repl_pattern <- function(sample_sheet) { - # get the file name and sample name columns from the samplesheet - file_name_col <- grep("File_Name|File Name", colnames(sample_sheet), ignore.case = TRUE) - sample_name_col <- grep("Sample_Name|Sample Name", colnames(sample_sheet), ignore.case = TRUE) - # get the unique sample names from the samplesheet - sample_names <- sample_sheet[sample_name_col] |> - unlist() |> - as.vector() |> - trimws() |> - unique() |> - sort() - # remove all characters from sample_names which are not letters, numbers, hyphens and periods - sample_names <- gsub("[^-.[:alnum:]]", "_", sample_names) - - # create replication pattern (which technical replicates belong to which sample) - repl_pattern <- split( - sample_sheet[[file_name_col]], - sample_sheet[[sample_name_col]] - ) - - return(repl_pattern) -} - diff --git a/DIMS/tests/testthat/fixtures/test_breaks.fwhm.RData b/DIMS/tests/testthat/fixtures/test_breaks.fwhm.RData deleted file mode 100644 index 699281d5..00000000 Binary files a/DIMS/tests/testthat/fixtures/test_breaks.fwhm.RData and /dev/null differ diff --git a/DIMS/tests/testthat/fixtures/test_trim_params.RData b/DIMS/tests/testthat/fixtures/test_trim_params.RData deleted file mode 100644 index f9370f55..00000000 Binary files a/DIMS/tests/testthat/fixtures/test_trim_params.RData and /dev/null differ diff --git a/DIMS/tests/testthat/parse_samplesheet_functions.R b/DIMS/tests/testthat/parse_samplesheet_functions.R deleted file mode 100644 index 8e467967..00000000 --- a/DIMS/tests/testthat/parse_samplesheet_functions.R +++ /dev/null @@ -1,24 +0,0 @@ -# unit tests for ParseSamplesheet -# function: generate_repl_pattern - -# source all functions for ParseSamplesheet -source("../../preprocessing/parse_samplesheet_functions.R") - -# test generate_repl_pattern -testthat::test_that("replication pattern is correctly generated", { - # create sample sheet tot test on: - test_file_names <- paste0(rep("RES_20260101_", 6), sprintf("%03d", 1:6)) - test_sample_names <- sort(rep(c("C1", "P2", "P3"), 2)) - test_sample_sheet <- as.data.frame(cbind(File_Name = test_file_names, Sample_Name = test_sample_names)) - - # test that a list of length 3 is generated - expect_length(generate_repl_pattern(test_sample_sheet), 3) - # test list names - expect_equal(names(generate_repl_pattern(test_sample_sheet)), unique(test_sample_names), TRUE) - - # test what happens if any sample name is used twice - test_sample_names <- gsub("P3", "P2", test_sample_names) - test_sample_sheet <- as.data.frame(cbind(File_Name = test_file_names, Sample_Name = test_sample_names)) - expect_length(generate_repl_pattern(test_sample_sheet), 2) - expect_length(generate_repl_pattern(test_sample_sheet)$P2, 4) -}) diff --git a/DIMS/tests/testthat/test_generate_breaks.R b/DIMS/tests/testthat/test_generate_breaks.R deleted file mode 100644 index bd383733..00000000 --- a/DIMS/tests/testthat/test_generate_breaks.R +++ /dev/null @@ -1,54 +0,0 @@ -# unit tests for GenerateBreaks -# functions: get_trim_parameters and get_breaks_for_bins - -# source all functions for GenerateBreaks -source("../../preprocessing/generate_breaks_functions.R") - -# test get_trim_parameters -testthat::test_that("trim parameters are correctly calculated", { - # create list of scan times to test on: - test_scantimes <- seq(0, 100, length = 168) - test_polarities <- c(rep("positive", 84), rep("negative", 84)) - test_trim <- 0.1 - - # test that the function produces no output except trim_params.RData file - expect_silent(get_trim_parameters(test_scantimes, test_polarities, test_trim)) - expect_true(file.exists("./trim_params.RData")) - - # test the parameters in the output RData file - load("./trim_params.RData") - test_trim_left_pos <- trim_left_pos - test_trim_left_neg <- trim_left_neg - test_trim_right_pos <- trim_right_pos - test_trim_right_neg <- trim_right_neg - rm(trim_left_pos, trim_left_neg, trim_right_pos, trim_right_neg) - - # load previously stored values from fixtures - load("fixtures/test_trim_params.RData") - expect_equal(test_trim_left_pos, trim_left_pos) - expect_equal(test_trim_left_neg, trim_left_neg) - expect_equal(test_trim_right_pos, trim_right_pos) - expect_equal(test_trim_right_neg, trim_right_neg) -}) - -# test get_breaks_for_bins -testthat::test_that("breaks are correctly calculated", { - # create list of scan times to test on: - test_mzrange <- c(300, 400) - test_resol <- 140000 - - # test that the function produces no output except breaks.fwhm.RData file - expect_silent(get_breaks_for_bins(test_mzrange, test_resol)) - expect_true(file.exists("./breaks.fwhm.RData")) - - # test the vectors in the output RData file - load("./breaks.fwhm.RData") - test_breaks_fwhm <- breaks_fwhm - test_breaks_fwhm_avg <- breaks_fwhm_avg - rm(breaks_fwhm, breaks_fwhm_avg) - - # load breaks from fixtures and compare vectors - load("fixtures/test_breaks.fwhm.RData") - expect_equal(test_breaks_fwhm, breaks_fwhm) - expect_equal(test_breaks_fwhm_avg, breaks_fwhm_avg) -})