Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 45 additions & 8 deletions DIMS/GenerateBreaks.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,56 @@ suppressPackageStartupMessages(library("xcms"))
cmd_args <- commandArgs(trailingOnly = TRUE)

filepath <- cmd_args[1]
trim <- as.numeric(cmd_args[2])
resol <- as.numeric(cmd_args[3])
outdir <- cmd_args[2]
trim <- as.numeric(cmd_args[3])
resol <- as.numeric(cmd_args[4])

# initialize
trim_left_pos <- NULL
trim_right_pos <- NULL
trim_left_neg <- NULL
trim_right_neg <- NULL
breaks_fwhm <- NULL
breaks_fwhm_avg <- NULL
bins <- NULL

# read in mzML file
raw_data <- suppressMessages(xcms::xcmsRaw(filepath))

# get trim parameters and save them to file
get_trim_parameters(raw_data@scantime, raw_data@polarity, trim)
# Get time values for positive and negative scans
pos_times <- raw_data@scantime[raw_data@polarity == "positive"]
neg_times <- raw_data@scantime[raw_data@polarity == "negative"]

# trim (remove) scans at the start and end for positive
trim_left_pos <- round(pos_times[length(pos_times) * (trim * 1.5)]) # 15% aan het begin
trim_right_pos <- round(pos_times[length(pos_times) * (1 - (trim * 0.5))]) # 5% aan het eind

# create breaks of bins for intensities. Bin size is a function of fwhm which is a function of m/z
get_breaks_for_bins(raw_data$mzrange, resol)
# trim (remove) scans at the start and end for negative
trim_left_neg <- round(neg_times[length(neg_times) * trim])
trim_right_neg <- round(neg_times[length(neg_times) * (1 - trim)])

# Determine maximum m/z and save to file
# Mass range m/z
low_mz <- raw_data@mzrange[1]
high_mz <- raw_data@mzrange[2]
save(high_mz, file = "highest_mz.RData")

# determine number of segments (bins)
nr_segments <- 2 * (high_mz - low_mz)
segment <- seq(from = low_mz, to = high_mz, length.out = nr_segments + 1)

# determine start and end of each bin.
for (i in 1:nr_segments) {
start_segment <- segment[i]
end_segment <- segment[i+1]
resol_mz <- resol * (1 / sqrt(2) ^ (log2(start_segment / 200)))
fwhm_segment <- start_segment / resol_mz
breaks_fwhm <- c(breaks_fwhm, seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment))
# average the m/z instead of start value
range <- seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment)
delta_mz <- range[2] - range[1]
breaks_fwhm_avg <- c(breaks_fwhm_avg, range + 0.5 * delta_mz)
}

# generate output file
save(breaks_fwhm, breaks_fwhm_avg, file = "breaks.fwhm.RData")
save(trim_left_pos, trim_right_pos, trim_left_neg, trim_right_neg, file = "trim_params.RData")
save(high_mz, file = "highest_mz.RData")
3 changes: 2 additions & 1 deletion DIMS/GenerateBreaks.nf
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,14 @@ process GenerateBreaks {
input:
tuple(val(file_id), path(mzML_file))


output:
path('breaks.fwhm.RData'), emit: breaks
path('trim_params.RData'), emit: trim_params
path('highest_mz.RData'), emit: highest_mz

script:
"""
Rscript ${baseDir}/CustomModules/DIMS/GenerateBreaks.R $mzML_file $params.trim $params.resolution
Rscript ${baseDir}/CustomModules/DIMS/GenerateBreaks.R $mzML_file ./ $params.trim $params.resolution
"""
}
29 changes: 29 additions & 0 deletions DIMS/MakeInit.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# define parameters
args <- commandArgs(trailingOnly = TRUE)

sample_sheet <- read.csv(args[1], sep = "\t")
nr_replicates <- as.numeric(args[2])

sample_names <- trimws(as.vector(unlist(sample_sheet[1])))
nr_sample_groups <- length(sample_names) / nr_replicates
group_names <- trimws(as.vector(unlist(sample_sheet[2])))
group_names <- gsub("[^-.[:alnum:]]", "_", group_names)
group_names_unique <- unique(group_names)

# generate the replication pattern
repl_pattern <- c()
for (sample_group in 1:nr_sample_groups) {
replicates_persample <- c()
for (repl in nr_replicates:1) {
index <- ((sample_group * nr_replicates) - repl) + 1
replicates_persample <- c(replicates_persample, sample_names[index])
}
repl_pattern <- c(repl_pattern, list(replicates_persample))
}

names(repl_pattern) <- group_names_unique

# preview the replication pattern
print(tail(repl_pattern))

save(repl_pattern, file = "init.RData")
18 changes: 18 additions & 0 deletions DIMS/MakeInit.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
process MakeInit {
tag "DIMS MakeInit"
label 'MakeInit'
container = 'docker://umcugenbioinf/dims:1.3'
shell = ['/bin/bash', '-euo', 'pipefail']

input:
path(samplesheet)
val(nr_replicates)

output:
path('init.RData')

script:
"""
Rscript ${baseDir}/CustomModules/DIMS/MakeInit.R $samplesheet $nr_replicates
"""
}
19 changes: 0 additions & 19 deletions DIMS/ParseSamplesheet.R

This file was deleted.

18 changes: 0 additions & 18 deletions DIMS/ParseSamplesheet.nf

This file was deleted.

2 changes: 1 addition & 1 deletion DIMS/preprocessing/collect_filled_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ order_columns_peakgrouplist <- function(peakgroup_list) {

original_colnames <- colnames(peakgroup_list)
mass_columns <- c(grep("mzm", original_colnames), grep("nrsamples", original_colnames))
descriptive_columns <- grep("assi_HMDB", original_colnames):grep("avg.int", original_colnames)
descriptive_columns <- c(grep("assi_HMDB", original_colnames):grep("avg.int", original_colnames), grep("ppmdev", original_colnames))
intensity_columns <- c((grep("nrsamples", original_colnames) + 1):(grep("assi_HMDB", original_colnames) - 1))
# if no Z-scores have been calculated, the following two variables will be empty without consequences for outlist_total
control_columns <- grep ("ctrls", original_colnames)
Expand Down
2 changes: 1 addition & 1 deletion DIMS/preprocessing/fill_missing_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ fill_missing_intensities <- function(peakgroup_list, repl_pattern, thresh, disab
for (zero_index in seq_along(zero_intensity)) {
peakgroup_list[zero_intensity[zero_index], names(repl_pattern)[sample_index]] <- rnorm(n = 1,
mean = thresh,
sd = 80)
sd = 100)
}
}

Expand Down
59 changes: 0 additions & 59 deletions DIMS/preprocessing/generate_breaks_functions.R

This file was deleted.

30 changes: 0 additions & 30 deletions DIMS/preprocessing/parse_samplesheet_functions.R

This file was deleted.

22 changes: 15 additions & 7 deletions DIMS/preprocessing/peak_finding_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,26 @@ search_regions_of_interest <- function(ints_fullrange) {
if (regions_of_interest_gte3[roi_nr, "length"] > 11) {
roi <- ints_fullrange[(regions_of_interest_gte3[roi_nr, "from"]:regions_of_interest_gte3[roi_nr, "to"]), ]
roi_intrange <- as.numeric(roi$int)
roi_firstindex <- as.numeric(rownames(roi)[1])
# look for local minima that separate the peaks
local_min_positions <- which(diff(sign(diff(roi_intrange))) == 2) + 1
if (length(local_min_positions) > 0) {
remove_roi_index <- c(remove_roi_index, roi_nr)
# find new indices for rois after splitting
new_rois_splitroi <- as.data.frame(matrix(0, ncol = 3, nrow = (length(local_min_positions) + 1)))
colnames(new_rois_splitroi) <- colnames(regions_of_interest_gte3)
# fill new rois matrix; from in column 1, to in column 2 and length in column 3
new_rois_splitroi[, 1] <- c(roi_firstindex, roi_firstindex + local_min_positions)
new_rois_splitroi[, 2] <- c(roi_firstindex + local_min_positions, roi_firstindex + length(roi_intrange))
new_rois_splitroi[, 3] <- new_rois_splitroi[, 2] - new_rois_splitroi[, 1]
start_pos <- regions_of_interest_gte3[roi_nr, "from"]
new_rois <- data.frame(from = 0, to = 0, length = 0)
new_rois_splitroi <- regions_of_interest_gte3[0, ]
for (local_min_index in 1:length(local_min_positions)) {
new_rois[, 1] <- start_pos
new_rois[, 2] <- start_pos + local_min_positions[local_min_index]
new_rois[, 3] <- new_rois[, 2] - new_rois[, 1] + 1
new_rois_splitroi <- rbind(new_rois_splitroi, new_rois)
start_pos <- new_rois[, 2]
}
# intensities after last local minimum
new_rois[, 1] <- start_pos
new_rois[, 2] <- regions_of_interest_gte3[roi_nr, "to"]
new_rois[, 3] <- new_rois[, 2] - new_rois[, 1] + 1
new_rois_splitroi <- rbind(new_rois_splitroi, new_rois)
# append
new_rois_all <- rbind(new_rois_all, new_rois_splitroi)
} else {
Expand Down
Binary file removed DIMS/tests/testthat/fixtures/test_breaks.fwhm.RData
Binary file not shown.
10 changes: 5 additions & 5 deletions DIMS/tests/testthat/fixtures/test_peakgroup_list.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"mzmed.pgrp" "nrsamples" "C101.1" "C102.1" "P2.1" "P3.1" "assi_HMDB" "all_hmdb_names" "iso_HMDB" "HMDB_code" "all_hmdb_ids" "sec_hmdb_ids" "theormz_HMDB" "ppmdev" "avg.int" "avg.ctrls" "sd.ctrls" "C101.1_Zscore" "C102.1_Zscore" "P2.1_Zscore" "P3.1_Zscore"
"1" 300.199680958642 0.451108327135444 1000 5000 10000 50000 "A" "A;X" NA "HMDB1234567" "HMDB1234567;HMDB1234567" NA 300.1996476 0.111112214857712 16500 3000 2828.42712474619 9000 13000 90000 130000
"2" 300.000315890415 0.498603057814762 2000 6000 20000 60000 "B" "B;Y" NA "HMDB1234567_1" "HMDB1234567_1;HMDB1234567_1" NA 300.00017417 0.473299680976197 22000 4000 2828.42712474619 10000 14000 1e+05 140000
"3" 300.254185894039 0.589562055887654 3000 7000 30000 70000 "C" "C;Z" NA "HMDB1234567_2" "HMDB1234567_2;HMDB1234567_2" NA 300.25413357 0.17426158930175 27500 5000 2828.42712474619 11000 15000 110000 150000
"4" 300.755745105678 0.277923040557653 4000 8000 40000 80000 "D" "D;V" NA "HMDB1234567_7" "HMDB1234567_7;HMDB1234567_7" NA 300.75568892 0.186787674436346 33000 6000 2828.42712474619 12000 16000 120000 160000
"mzmed.pgrp" "nrsamples" "C101.1" "C102.1" "P2.1" "P3.1" "assi_HMDB" "all_hmdb_names" "iso_HMDB" "HMDB_code" "all_hmdb_ids" "sec_hmdb_ids" "theormz_HMDB" "avg.int" "avg.ctrls" "sd.ctrls" "C101.1_Zscore" "C102.1_Zscore" "P2.1_Zscore" "P3.1_Zscore" "ppmdev"
"1" 300.199680958642 0.451108327135444 1000 5000 10000 50000 "A" "A;X" NA "HMDB1234567" "HMDB1234567;HMDB1234567" NA 300.1996476 16500 3000 2828.42712474619 9000 13000 90000 130000 0.111112214857712
"2" 300.000315890415 0.498603057814762 2000 6000 20000 60000 "B" "B;Y" NA "HMDB1234567_1" "HMDB1234567_1;HMDB1234567_1" NA 300.00017417 22000 4000 2828.42712474619 10000 14000 1e+05 140000 0.473299680976197
"3" 300.254185894039 0.589562055887654 3000 7000 30000 70000 "C" "C;Z" NA "HMDB1234567_2" "HMDB1234567_2;HMDB1234567_2" NA 300.25413357 27500 5000 2828.42712474619 11000 15000 110000 150000 0.17426158930175
"4" 300.755745105678 0.277923040557653 4000 8000 40000 80000 "D" "D;V" NA "HMDB1234567_7" "HMDB1234567_7;HMDB1234567_7" NA 300.75568892 33000 6000 2828.42712474619 12000 16000 120000 160000 0.186787674436346
Binary file removed DIMS/tests/testthat/fixtures/test_trim_params.RData
Binary file not shown.
24 changes: 0 additions & 24 deletions DIMS/tests/testthat/parse_samplesheet_functions.R

This file was deleted.

2 changes: 1 addition & 1 deletion DIMS/tests/testthat/test_collect_filled.R
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ testthat::test_that("columns in peak group list are corretly sorted", {
# original order of columns
original_column_order <- colnames(test_peakgroup_list)
# after ordering, column names should be re-ordered
test_column_order <- original_column_order[c(1, 2, 7:15, 3:6, 16:21)]
test_column_order <- original_column_order[c(1, 2, 7:14, 21, 3:6, 15:20)]

expect_identical(colnames(order_columns_peakgrouplist(test_peakgroup_list)), test_column_order)

Expand Down
Loading
Loading