From 945e8954aca269d81f9c8f05ac05891f24f2e7c9 Mon Sep 17 00:00:00 2001 From: cgpu Date: Wed, 17 Nov 2021 03:23:13 +0000 Subject: [PATCH 1/3] Trims .bam from cram files; Adds crai --- main.nf | 90 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 51 insertions(+), 39 deletions(-) diff --git a/main.nf b/main.nf index c14d2ca..9e848cb 100644 --- a/main.nf +++ b/main.nf @@ -4,18 +4,18 @@ def helpMessage() { log.info """ Usage: nextflow run main.nf --input input.csv --reference reference.fasta [Options] - + Inputs Options: --input Input csv file with bam paths --reference Reference fasta file Resource Options: --cpus Number of CPUs (int) - (default: $params.cpus) + (default: $params.cpus) --max_cpus Maximum number of CPUs (int) (default: $params.max_cpus) --memory Memory (memory unit) - (default: $params.memory) + (default: $params.memory) --max_memory Maximum memory (memory unit) (default: $params.max_memory) --time Time limit (time unit) @@ -81,13 +81,14 @@ process samtools_default_30 { input: file(bam_file) from ch_input_0 each file(reference) from ch_reference_0 - + output: - file "*.cram" + file "*.cra*" script: """ - samtools view -T $reference -o ${bam_file}.cram -O cram,version=3.0 $bam_file + samtools view -T $reference -o ${bam_file.simpleName}.cram -O cram,version=3.0 $bam_file + samtools index ${bam_file.simpleName}.cram """ } @@ -99,13 +100,14 @@ process samtools_default_31 { input: file(bam_file) from ch_input_1 each file(reference) from ch_reference_1 - + output: - file "*.cram" + file "*.cra*" script: """ - samtools view --threads $task.cpus -T $reference -o ${bam_file}.cram -O cram,version=3.1 $bam_file + samtools view --threads $task.cpus -T $reference -o ${bam_file.simpleName}.cram -O cram,version=3.1 $bam_file + samtools index ${bam_file.simpleName}.cram """ } @@ -117,13 +119,14 @@ process samtools_normal_30 { input: file(bam_file) from ch_input_2 each file(reference) from ch_reference_2 - + output: - file "*.cram" + file "*.cra*" script: """ - samtools view --threads $task.cpus -T $reference -o ${bam_file}.cram -O cram,version=3.0 --output-fmt-option seqs_per_slice=10000 $bam_file + samtools view --threads $task.cpus -T $reference -o ${bam_file.simpleName}.cram -O cram,version=3.0 --output-fmt-option seqs_per_slice=10000 $bam_file + samtools index ${bam_file.simpleName}.cram """ } @@ -135,13 +138,14 @@ process samtools_normal_31 { input: file(bam_file) from ch_input_3 each file(reference) from ch_reference_3 - + output: - file "*.cram" + file "*.cra*" script: """ - samtools view --threads $task.cpus -T $reference -o ${bam_file}.cram -O cram,version=3.1 --output-fmt-option seqs_per_slice=10000 $bam_file + samtools view --threads $task.cpus -T $reference -o ${bam_file.simpleName}.cram -O cram,version=3.1 --output-fmt-option seqs_per_slice=10000 $bam_file + samtools index ${bam_file.simpleName}.cram """ } @@ -153,13 +157,14 @@ process samtools_fast_30 { input: file(bam_file) from ch_input_4 each file(reference) from ch_reference_4 - + output: - file "*.cram" + file "*.cra*" script: """ - samtools view --threads $task.cpus -T $reference -o ${bam_file}.cram -O cram,version=3.0,level=1 --output-fmt-option seqs_per_slice=1000 $bam_file + samtools view --threads $task.cpus -T $reference -o ${bam_file.simpleName}.cram -O cram,version=3.0,level=1 --output-fmt-option seqs_per_slice=1000 $bam_file + samtools index ${bam_file.simpleName}.cram """ } @@ -171,13 +176,14 @@ process samtools_fast_31 { input: file(bam_file) from ch_input_5 each file(reference) from ch_reference_5 - + output: - file "*.cram" + file "*.cra*" script: """ - samtools view --threads $task.cpus -T $reference -o ${bam_file}.cram -O cram,version=3.1,level=1 --output-fmt-option seqs_per_slice=1000 $bam_file + samtools view --threads $task.cpus -T $reference -o ${bam_file.simpleName}.cram -O cram,version=3.1,level=1 --output-fmt-option seqs_per_slice=1000 $bam_file + samtools index ${bam_file.simpleName}.cram """ } @@ -189,13 +195,14 @@ process samtools_small_30 { input: file(bam_file) from ch_input_6 each file(reference) from ch_reference_6 - + output: - file "*.cram" + file "*.cra*" script: """ - samtools view --threads $task.cpus -T $reference -o ${bam_file}.cram -O cram,version=3.0,level=6,use_bzip2=1 --output-fmt-option seqs_per_slice=25000 $bam_file + samtools view --threads $task.cpus -T $reference -o ${bam_file.simpleName}.cram -O cram,version=3.0,level=6,use_bzip2=1 --output-fmt-option seqs_per_slice=25000 $bam_file + samtools index ${bam_file.simpleName}.cram """ } @@ -207,13 +214,14 @@ process samtools_small_31 { input: file(bam_file) from ch_input_7 each file(reference) from ch_reference_7 - + output: - file "*.cram" + file "*.cra*" script: """ - samtools view --threads $task.cpus -T $reference -o ${bam_file}.cram -O cram,version=3.1,level=6,use_bzip2=1,use_fqz=1 --output-fmt-option seqs_per_slice=25000 $bam_file + samtools view --threads $task.cpus -T $reference -o ${bam_file.simpleName}.cram -O cram,version=3.1,level=6,use_bzip2=1,use_fqz=1 --output-fmt-option seqs_per_slice=25000 $bam_file + samtools index ${bam_file.simpleName}.cram """ } @@ -225,13 +233,14 @@ process samtools_archive_30 { input: file(bam_file) from ch_input_8 each file(reference) from ch_reference_8 - + output: - file "*.cram" + file "*.cra*" script: """ - samtools view --threads $task.cpus -T $reference -o ${bam_file}.cram -O cram,version=3.0,level=7,use_bzip2=1 --output-fmt-option seqs_per_slice=100000 $bam_file + samtools view --threads $task.cpus -T $reference -o ${bam_file.simpleName}.cram -O cram,version=3.0,level=7,use_bzip2=1 --output-fmt-option seqs_per_slice=100000 $bam_file + samtools index ${bam_file.simpleName}.cram """ } @@ -243,13 +252,14 @@ process samtools_archive_31 { input: file(bam_file) from ch_input_9 each file(reference) from ch_reference_9 - + output: - file "*.cram" + file "*.cra*" script: """ - samtools view --threads $task.cpus -T $reference -o ${bam_file}.cram -O cram,version=3.1,level=7,use_bzip2=1,use_fqz=1,use_arith=1 --output-fmt-option seqs_per_slice=100000 $bam_file + samtools view --threads $task.cpus -T $reference -o ${bam_file.simpleName}.cram -O cram,version=3.1,level=7,use_bzip2=1,use_fqz=1,use_arith=1 --output-fmt-option seqs_per_slice=100000 $bam_file + samtools index ${bam_file.simpleName}.cram """ } @@ -261,13 +271,14 @@ process samtools_archive_lzma_30 { input: file(bam_file) from ch_input_10 each file(reference) from ch_reference_10 - + output: - file "*.cram" + file "*.cra*" script: """ - samtools view --threads $task.cpus -T $reference -o ${bam_file}.cram -O cram,version=3.0,level=7,use_bzip2=1,use_lzma=1 --output-fmt-option seqs_per_slice=100000 $bam_file + samtools view --threads $task.cpus -T $reference -o ${bam_file.simpleName}.cram -O cram,version=3.0,level=7,use_bzip2=1,use_lzma=1 --output-fmt-option seqs_per_slice=100000 $bam_file + samtools index ${bam_file.simpleName}.cram """ } @@ -279,12 +290,13 @@ process samtools_archive_lzma_31 { input: file(bam_file) from ch_input_11 each file(reference) from ch_reference_11 - + output: - file "*.cram" + file "*.cra*" script: """ - samtools view --threads $task.cpus -T $reference -o ${bam_file}.cram -O cram,version=3.1,level=7,use_bzip2=1,use_fqz=1,use_arith=1,use_lzma=1 --output-fmt-option seqs_per_slice=100000 $bam_file + samtools view --threads $task.cpus -T $reference -o ${bam_file.simpleName}.cram -O cram,version=3.1,level=7,use_bzip2=1,use_fqz=1,use_arith=1,use_lzma=1 --output-fmt-option seqs_per_slice=100000 $bam_file + samtools index ${bam_file.simpleName}.cram """ } \ No newline at end of file From 3b4334a04a2360c76d38d165ba207a875d6dcaa1 Mon Sep 17 00:00:00 2001 From: cgpu Date: Wed, 17 Nov 2021 03:26:54 +0000 Subject: [PATCH 2/3] Fixes issue with deprecation of s3 support for NF --- testdata/test_input_cloudos.csv | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/testdata/test_input_cloudos.csv b/testdata/test_input_cloudos.csv index 7ed7d84..78057f9 100644 --- a/testdata/test_input_cloudos.csv +++ b/testdata/test_input_cloudos.csv @@ -1,3 +1,2 @@ bam -s3://eu-west-1-example-data/nihr/testdata/pb_normal.bam -s3://eu-west-1-example-data/nihr/testdata/pb_tumor.bam +https://eu-west-1-example-data.s3-eu-west-1.amazonaws.com/nihr/testdata/pb_normal.bam \ No newline at end of file From b5582a5eb50c7d8512bd3dfe93cc2cd5752f0de9 Mon Sep 17 00:00:00 2001 From: cgpu Date: Wed, 17 Nov 2021 13:56:26 +0000 Subject: [PATCH 3/3] Adds pre,post script+heuristic for ci disk size --- .github/workflows/ci.yml | 1 + conf/test.config | 5 ++++- main.nf | 24 ++++++++++++++++++++++++ nextflow.config | 18 +++++++++++------- 4 files changed, 40 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5376a28..f4e7193 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -18,3 +18,4 @@ jobs: - name: Basic workflow tests run: | nextflow run ${GITHUB_WORKSPACE} --config conf/test.config + echo "Results tree view:" ; tree -a results; head results/**/*txt diff --git a/conf/test.config b/conf/test.config index 93376dc..694b445 100644 --- a/conf/test.config +++ b/conf/test.config @@ -1,8 +1,11 @@ docker.enabled = true params { - input = 'testdata/test_input_cloudos.csv' reference = 's3://eu-west-1-example-data/nihr/testdata/Homo_sapiens_assembly38.fasta' report_dir = "/opt/bin" + // delete the actual files to save space in Github Actions + pre_script = "df -h; ls -lh" + post_script = "df -h; ls -lh > metadata.cram.txt; rm *.cram; rm *.crai" + echo = true } diff --git a/main.nf b/main.nf index 9e848cb..6c441f7 100644 --- a/main.nf +++ b/main.nf @@ -87,8 +87,10 @@ process samtools_default_30 { script: """ + ${params.pre_script} samtools view -T $reference -o ${bam_file.simpleName}.cram -O cram,version=3.0 $bam_file samtools index ${bam_file.simpleName}.cram + ${params.post_script} """ } @@ -106,8 +108,10 @@ process samtools_default_31 { script: """ + ${params.pre_script} samtools view --threads $task.cpus -T $reference -o ${bam_file.simpleName}.cram -O cram,version=3.1 $bam_file samtools index ${bam_file.simpleName}.cram + ${params.post_script} """ } @@ -125,8 +129,10 @@ process samtools_normal_30 { script: """ + ${params.pre_script} samtools view --threads $task.cpus -T $reference -o ${bam_file.simpleName}.cram -O cram,version=3.0 --output-fmt-option seqs_per_slice=10000 $bam_file samtools index ${bam_file.simpleName}.cram + ${params.post_script} """ } @@ -144,8 +150,10 @@ process samtools_normal_31 { script: """ + ${params.pre_script} samtools view --threads $task.cpus -T $reference -o ${bam_file.simpleName}.cram -O cram,version=3.1 --output-fmt-option seqs_per_slice=10000 $bam_file samtools index ${bam_file.simpleName}.cram + ${params.post_script} """ } @@ -163,8 +171,10 @@ process samtools_fast_30 { script: """ + ${params.pre_script} samtools view --threads $task.cpus -T $reference -o ${bam_file.simpleName}.cram -O cram,version=3.0,level=1 --output-fmt-option seqs_per_slice=1000 $bam_file samtools index ${bam_file.simpleName}.cram + ${params.post_script} """ } @@ -182,8 +192,10 @@ process samtools_fast_31 { script: """ + ${params.pre_script} samtools view --threads $task.cpus -T $reference -o ${bam_file.simpleName}.cram -O cram,version=3.1,level=1 --output-fmt-option seqs_per_slice=1000 $bam_file samtools index ${bam_file.simpleName}.cram + ${params.post_script} """ } @@ -201,8 +213,10 @@ process samtools_small_30 { script: """ + ${params.pre_script} samtools view --threads $task.cpus -T $reference -o ${bam_file.simpleName}.cram -O cram,version=3.0,level=6,use_bzip2=1 --output-fmt-option seqs_per_slice=25000 $bam_file samtools index ${bam_file.simpleName}.cram + ${params.post_script} """ } @@ -220,8 +234,10 @@ process samtools_small_31 { script: """ + ${params.pre_script} samtools view --threads $task.cpus -T $reference -o ${bam_file.simpleName}.cram -O cram,version=3.1,level=6,use_bzip2=1,use_fqz=1 --output-fmt-option seqs_per_slice=25000 $bam_file samtools index ${bam_file.simpleName}.cram + ${params.post_script} """ } @@ -239,8 +255,10 @@ process samtools_archive_30 { script: """ + ${params.pre_script} samtools view --threads $task.cpus -T $reference -o ${bam_file.simpleName}.cram -O cram,version=3.0,level=7,use_bzip2=1 --output-fmt-option seqs_per_slice=100000 $bam_file samtools index ${bam_file.simpleName}.cram + ${params.post_script} """ } @@ -258,8 +276,10 @@ process samtools_archive_31 { script: """ + ${params.pre_script} samtools view --threads $task.cpus -T $reference -o ${bam_file.simpleName}.cram -O cram,version=3.1,level=7,use_bzip2=1,use_fqz=1,use_arith=1 --output-fmt-option seqs_per_slice=100000 $bam_file samtools index ${bam_file.simpleName}.cram + ${params.post_script} """ } @@ -277,8 +297,10 @@ process samtools_archive_lzma_30 { script: """ + ${params.pre_script} samtools view --threads $task.cpus -T $reference -o ${bam_file.simpleName}.cram -O cram,version=3.0,level=7,use_bzip2=1,use_lzma=1 --output-fmt-option seqs_per_slice=100000 $bam_file samtools index ${bam_file.simpleName}.cram + ${params.post_script} """ } @@ -296,7 +318,9 @@ process samtools_archive_lzma_31 { script: """ + ${params.pre_script} samtools view --threads $task.cpus -T $reference -o ${bam_file.simpleName}.cram -O cram,version=3.1,level=7,use_bzip2=1,use_fqz=1,use_arith=1,use_lzma=1 --output-fmt-option seqs_per_slice=100000 $bam_file samtools index ${bam_file.simpleName}.cram + ${params.post_script} """ } \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 4c18d0c..b4e56d8 100644 --- a/nextflow.config +++ b/nextflow.config @@ -5,7 +5,7 @@ // 1. Parameters -// NOTE: +// NOTE: // Initialise the values of the params to the preferred default value or to false params { // input options @@ -20,7 +20,7 @@ params { // when set to true, prints help and exits help = false - + // container for all processes, excluding those defined with 'withName' (see example below) container = 'quay.io/lifebitai/samtools:1.14' @@ -29,12 +29,12 @@ params { memory = 4.GB time = 8.h disk = '30.GB' - + // max resources limits defaults max_cpus = 2 max_memory = 4.GB max_time = 8.h - + // execution related defaults config = 'conf/standard.config' echo = false @@ -50,6 +50,10 @@ params { zone = 'us-east1-b' network = 'default' subnetwork = 'default' + + //debugging variables + pre_script = "df -h; ls -lh" + post_script = "df -h; ls -lh" } @@ -57,8 +61,8 @@ params { // Do not update the order because the values set in params scope will not be overwritten -// Do not attempt to simplify to -// includeConfig params.config +// Do not attempt to simplify to +// includeConfig params.config // outside of profiles scope, it will fail to update the values of the params profiles { standard {includeConfig params.config} @@ -80,7 +84,7 @@ process { maxRetries = params.maxRetries maxForks = params.maxForks container = params.container - errorStrategy = params.errorStrategy + errorStrategy = params.errorStrategy } // 4. Executor