From 3bebadcca10edd6329a80d8fdeb43974f6b8667d Mon Sep 17 00:00:00 2001 From: Sangram K Sahu Date: Wed, 8 Nov 2023 11:45:45 +0000 Subject: [PATCH] Release v2.0.0 (#8) * first commit * dummy ex process * row split * row split * sed g fix * hpo list * vcf path rm ln * proband placeholder * proband placeholder * update exomiser|PED|parser_py * remove dsl1 param * readlink PED file * sed colon * sed colon fix * ped input by cat * file ped no proband * Ped_parser container added * dockerhub test * fix ped cmd bash error * fix ped cmd bash error * add quay image * collect input * mistaken deletion fix * add maxforks * ls it * wildcard input collect * rm collect * split channels * remove exomiser data * add exomiser dat * add exomiser dat * final * final2 * gunzip basename * gunzip basename * HPO fix again * proband id channel * fix if ch * add index * add retry * copy file * copy file * gunzip file * tuple input output * debug local * fork1 * rm ln hash * fork 5 delay submit * uncomment out exomiser commands - fork =1 * add container options * change rate * remove nans from ped * rate limit 5m * retries * rm symbolic link * force symbolic link * size vcf symbolic * stat symbolic link * join channels * realign params * ln svf * update yaml conf * Update main.nf * batch run fix Removes the error `ln: failed to create symbolic link '/data/exomiser-data-bundle': No such file or directory` when run with batch * added the ped_module script in the bin and removed the hardcoded path * added testing_family_file * changed paths in the family file * changed the sep of the family file * removed family file as it is in S3 * fixed pipeline bugs in testing * fixed bugs in testing * added documentation for the pipeline * fixed url in documentation for the pipeline * added default paths to files in S3 bucket * fixed typo * changed the docker for bug on platform * changed bundle data path * commented debug code * removed_js_files * change name of the profiles * fixed typo in documentation 
* changed channel names * outputs MultiQC html into outdir * removing directive from process and adding them to the config file * added usage command line with params and defaults * parametrized resources * fix typo * fix typo * fix typo * fix typo * fix typo * moved submitRateLimit back to the process * added testing profile for multi_hpo * fixed README * fix typo --------- Co-authored-by: Ryan Cardenas Co-authored-by: Ryan Cardenas <57348405+R-Cardenas@users.noreply.github.com> Co-authored-by: Ryan Cardenas Co-authored-by: Leila Mansouri Co-authored-by: Leila Mansouri <48998340+l-mansouri@users.noreply.github.com> --- .github/workflows/ci_test.yml | 2 +- .gitignore | 1 + README.md | 123 ++++++--- bin/add_exomiser_fields_to_genotiers.js | 320 ------------------------ bin/application.properties | 14 +- bin/auto_config.yml | 39 ++- bin/auto_config_V2.yml | 138 ++++++++++ bin/genotiers.js | 213 ---------------- bin/get_hpo_terms_from_barcode.js | 119 --------- bin/ped_module.py | 106 ++++++++ bin/ped_module_man.py | 69 +++++ conf/family_test.config | 7 + conf/multi_hpo_test.config | 7 + conf/single_vcf_test.config | 8 + containers/ped_parser/Dockerfile | 23 ++ main.nf | 195 ++++++++------- nextflow.config | 44 +++- testdata/fam_file.tsv | 2 + testdata/fam_file_multi_hpo.tsv | 2 + testdata/single_vcf.tsv | 2 + 20 files changed, 630 insertions(+), 804 deletions(-) delete mode 100644 bin/add_exomiser_fields_to_genotiers.js create mode 100644 bin/auto_config_V2.yml delete mode 100644 bin/genotiers.js delete mode 100644 bin/get_hpo_terms_from_barcode.js create mode 100755 bin/ped_module.py create mode 100755 bin/ped_module_man.py create mode 100644 conf/family_test.config create mode 100644 conf/multi_hpo_test.config create mode 100644 conf/single_vcf_test.config create mode 100644 containers/ped_parser/Dockerfile create mode 100644 testdata/fam_file.tsv create mode 100644 testdata/fam_file_multi_hpo.tsv create mode 100644 testdata/single_vcf.tsv diff --git 
a/.github/workflows/ci_test.yml b/.github/workflows/ci_test.yml index fcc8101..b575386 100644 --- a/.github/workflows/ci_test.yml +++ b/.github/workflows/ci_test.yml @@ -23,4 +23,4 @@ # sudo mv nextflow /usr/local/bin/ # - name: Basic workflow tests # run: | -# nextflow run ${GITHUB_WORKSPACE} -profile test +# nextflow run ${GITHUB_WORKSPACE} -profile single_vcf_test diff --git a/.gitignore b/.gitignore index 4fc773e..1410098 100644 --- a/.gitignore +++ b/.gitignore @@ -18,6 +18,7 @@ nextflow_schema.json *fq **/2102_* exo +*.mt/* // Ignore node modules bin/node_modules/ diff --git a/README.md b/README.md index ff24814..96ad979 100644 --- a/README.md +++ b/README.md @@ -1,54 +1,119 @@ -# Exomiser +# Exomiser -## Exomiser -> container -> pipeline -> Run at your peril +## Pipeline documentation -### Containarizing Exomiser +Table of contents -Containarization provides a solution to isolate all software dependencies and make an aplication / workflow reproducible. +- [Pipeline documentation](#pipeline-documentation) + - [Pipeline description](#pipeline-description) + - [Pipeline overview](#pipeline-overview) + - [Input](#input) + - [--\](#--name_of_main_input) + - [Processes](#processes) + - [Output](#output) + - [Options](#options) + - [General Options](#general-options) + - [Resource Allocation](#resource-allocation) + - [Usage](#usage) + - [Running with Docker or Singularity](#running-with-docker-or-singularity) + - [Running on CloudOS](#running-on-cloudos) + - [Testing](#testing) + - [Profiles](#profiles) + - [Stress Testing](#stress-testing) -In order to containirize Exomiser we started from a Docker base image with Ubuntu and Java8, which is the main dependency for Exomiser to be able to run. +## Pipeline description -The container pulls the latest publicly available release of `Exomiser v12.0.0` from Github releases, and unzips it inside the container. 
Modifications to the application property are included to match the genome reference data used for running exomiser. The version selecter for the container is `1811`. +### Pipeline overview -Latest version of exomiser accept parameters and configuration from a YAML file. This YAML file defines input such as VCF files, reference genome version, HPO terms and other parameter configuration. In order to create a directly executable application for Exomiser, without the need to write beforehand the YAML file, we added a `Python` handle script that just do that. +- Name: exomiser-pipeline-nf +- Tools: exomiser +- Version: 12.1.0 -The `Python` script `run.py` takes care of: -* Getting Exomiser input parameters from command line -* Create a YAML file from a template and updating it given parameters received -* Run Exomiser with the YAML file just created +It is a fully containerised nextflow pipeline that runs exomisers on either a single sample VCF file or a trio VCF file. -The script also takes care of handling the pointer to the genome reference dataset used for running Exomiser. +The Exomiser is a tool to perform genome-wide prioritisation of genomic variants including non-coding and regulatory variants using patient phenotypes as a means of differentiating candidate genes. -Since reference data is quite large, doesn't make much sense to include it in the container, as it would turn into a +50 GB image... +To perform an analysis, Exomiser requires the patient's genome/exome in VCF format and their phenotype encoded in [HPO terms](https://hpo.jax.org/app/). The exomiser is also capable of analysing trios/small family genomes. -Instead the fetching of the data, which includes: -* VCF -* Reference Exomiser genome dataset +The main input of the pipeline (`families_file`) is a TSV file and the main output of the pipeline is an HTML file containing pathogenicity score of the called variants. -Will be taken care by a simple Nextflow pipeline. 
+### Input -## Containarized Exomiser pipeline +#### --families_file -In order to run Exomiser with a VCF file, reference genome datasets have to be fetched and unzipped for Exomiser to run. We separated the data and file staging part from the container into a pipeline (Nextflow) which will take care of pulling the data into the working directory for Exomiser to run successfully. +This is a TSV file that contains the following info tab separated -The reference dataset has been added as a parameter, allowing flexibility to pull the data from any resource (i.e. cloud, local storage, ftp, ...) and Nextlfow pipeline will automatically take care of fetching the data without having to add any logic to Exomiser process/script. +| run_id | proband_id | hpo | vcf_path | vcf_index_path | proband_sex | mother_id | father_id | +| :----: | :--------: | :-: | :------: | :------------: | :---------: | :-------: | :-------: | +| | | | | | | | | +The vcf_path column can contain the path to either a multiVCF(trio) or a single-sample VCF. +In the case of a single-sample VCF, the last 2 columns must contain `nan` as a value. An example can be found [here](https://lifebit-featured-datasets.s3.eu-west-1.amazonaws.com/pipelines/exomiser-nf/fam_file.tsv) -## Test the pipeline +In the hpo column, multiple comma-separated HPO terms can be present. -### Download local copy of exomiser-data-bundle (~120GB) +### --application_properties -```bash -sudo aws s3 sync s3://lifebit-featured-datasets/pipelines/exomiser-data-bundle /data/exomiser-data-bundle --no-sign-request +This is a file needed by exomiser to run. It contains information on where to find the reference data as well as the versioning of the reference genome. An example can be found [here](https://lifebit-featured-datasets.s3.eu-west-1.amazonaws.com/pipelines/exomiser-nf/application.properties) + +### --auto_config_yml + +This is a file needed by exomiser to run. 
It contains placeholders in the text that get filled in by the second process of the pipeline just before running exomiser. The one used for testing can be found [here](https://lifebit-featured-datasets.s3.eu-west-1.amazonaws.com/pipelines/exomiser-nf/auto_config.yml) + +### --exomiser_data + +This path refers to the reference data bundle needed by exomiser (~120 GB!). A copy of such files can be found [here](https://lifebit-featured-datasets.s3.eu-west-1.amazonaws.com/pipelines/exomiser-data-bundle/) . The reference dataset has been added as a parameter, allowing flexibility to pull the data from any resource (i.e. cloud, local storage, ftp, ...) and Nextlfow will automatically take care of fetching the data without having to add anything to the pipeline itself. + +There are other parameters that can be tweaked to personalize the behaviour of the pipeline. These are referenced in `nextflow.config` + +### Processes + +Here is the list of steps performed by this pipeline. + +1. `process ped_hpo_creation` - this process produces the pedigree (PED) file needed for exomiser to run using a python script. +2. `process exomiser` - this process is where the autoconfig file for exomiser is generated and exomiser is run. 
+ +### Output + +- a html and a json file containing a report on the analysis +- the autoconfig file, for reproducibility purpose +- a vcf file with the called variants that are identified as causative + +### Usage + +The pipeline can be run like: + +``` +nextflow run main.nf --families_file 's3://lifebit-featured-datasets/pipelines/exomiser-nf/fam_file.tsv' \ + --prioritisers 'hiPhivePrioritiser' \ + --exomiser_data 's3://lifebit-featured-datasets/pipelines/exomiser-data-bundle' \ + --application_properties 's3://lifebit-featured-datasets/pipelines/exomiser-nf/application.properties' \ + --auto_config_yml 's3://lifebit-featured-datasets/pipelines/exomiser-nf/auto_config.yml' ``` -Or if in a EC2 from HK region download faster with HKGI AWS credentials +### Testing + +To run the pipeline with `docker` (used by default), type the following commands: + +To test the pipeline on a multi-VCF: + ``` -sudo aws s3 sync s3://lifebit-featured-datasets-hkgi/pipelines/exomiser-data-bundle /data/exomiser-data-bundle +nextflow run main.nf -profile family_test ``` -### Test run (With a HOP term file, no fetch from database) +To test the pipeline on a single-sample VCF: -```bash -nextflow run main.nf -profile test ``` +nextflow run main.nf -profile single_vcf_test +``` + +Be careful when running this, as the pipeline requires the staging of 120 GB of reference data, required by exomiser, so only that takes a while! 
+ +### Running on CloudOS + +### Profiles + +| profile name | Run locally | Run on CloudOS | description | +| :-------------: | :--------------------------------------------------------------------: | :------------: | :-----------------------------------------------------------------------------: | +| test_family | the data required is so big, it was tested on a c5.4xlarge EC2 machine | Successful | this test is designed to test the pipeline on a multi-VCF with trio information | +| test_single_vcf | the data required is so big, it was tested on a c5.4xlarge EC2 machine | Successful | this test is designed to test the pipeline on a single-sample-VCF | diff --git a/bin/add_exomiser_fields_to_genotiers.js b/bin/add_exomiser_fields_to_genotiers.js deleted file mode 100644 index 6122a34..0000000 --- a/bin/add_exomiser_fields_to_genotiers.js +++ /dev/null @@ -1,320 +0,0 @@ -// ADD EXOMISER FIELDS TO GENOTIERS -// takes a TSV file, extracts `fullLocation` (i.e takes `#CHROM`, ``POS`, `REF` and `ALT` and produces `fullLocation`), -// looks for matching `fullLocation` in the `genotiers` collection in MongoDB and -// updates the matching document with the rest of the exomiser fields - -// To run: -// minimum: -// node --no-warnings ./bin/add_exomiser_fields_to_genotiers.js -t ./testdata/exomiserVariantsHG001-NA12878-pFDA_S1_L001_100k_AR.variants.tsv -// full set: -// node --no-warnings ./bin/add_exomiser_fields_to_genotiers.js --tsvFile ./testdata/exomiserVariantsHG001-NA12878-pFDA_S1_L001_100k_AR.variants.tsv --databaseName clinical-dev --uri mongodb://localhost:27017 - -// functions -// closeConnection function closes the connection to MongoDB -function closeConnection() { - mongoose.connection.close(function() { - // console.log('Mongoose disconnected '); - }); -} -// makeFullLocation function creates `fullLocation` from `#CHROM`, ``POS`, `REF` and `ALT` -function makeFullLocation (chromosome, position, reference, alternative){ - var fullLocation; - if(chromosome && 
position && reference && alternative){ - fullLocation = chromosome + ':' + position + '-' + reference + '-' + alternative; - } - return fullLocation; -} - -// required packages -const { program } = require('commander'); -const fs = require('fs') -const mongoose = require('mongoose'); -require('./genotiers'); - -// Take inputs from command line -program - .description('A script that, given a tsv file, retrieves the corresponding genotier from database and updates it with tsv file results') - .option('-t, --tsvFile ', 'Tsv file with exomiser results') - .option('-d, --databaseName ', 'Database name', "clinical-dev") - .option('-u, --uri ', 'Uri to database', "mongodb://localhost:27017"); -program.parse(); -var tsvFilePath = program.opts().tsvFile -var dbName = program.opts().databaseName -var uri = program.opts().uri - -// Extract file name from file path -var tsvFileName=tsvFilePath.split('/')[tsvFilePath.split('/').length-1].split('.')[0] -// Initiate log file -var logFileName = tsvFileName+'_addToDatabase.log' -fs.writeFileSync(logFileName, ''); - -// Connect to mongoDB -mongoose.connect(uri + '/' + dbName, {useNewUrlParser: true, useUnifiedTopology: true}); - -// If cannot connect beacause of error -mongoose.connection.on('error', function(err) { - console.log('Mongoose connection error: ' + err); -}); - -// Read and split tsv file -var tsvFileContent = fs.readFileSync(tsvFilePath, 'utf8'); -var tsvFileContentSplit = tsvFileContent.split('\n') -var header = tsvFileContentSplit[0].split('\t') -var i; -var count = 0; -// Iterate through header to get chromosome, pos, alt and ref -for(i=0;i { - if(err){ - console.log(err) - if(count == tsvFileContentSplit.length-1){ - closeConnection() - } - } else if(!genotier){ - // console.log('No results') - count = count+1; - fs.appendFileSync(logFileName, 'Genotier for location: '+fullLocation+' not found \n') - if(count == tsvFileContentSplit.length-1){ - closeConnection() - } - } else { - if(genotier.length==0){ - // 
console.log('No results') - count = count+1; - fs.appendFileSync(logFileName, 'Genotier for location: '+fullLocation+' not found \n') - if(count == tsvFileContentSplit.length-1){ - closeConnection() - } - } else { - // if genotier found add all fields - var k=0; - for(k=0;k0.483)')){ - if(row[k]=='.'){ - genotier.exomiserCadd = '' - } else { - genotier.exomiserCadd = row[k] - } - } else if(value.includes('POLYPHEN(>0.956|>0.446)')){ - if(row[k]=='.'){ - genotier.exomiserPolyphen = '' - } else { - genotier.exomiserPolyphen = row[k] - } - } else if(value.includes('MUTATIONTASTER(>0.94)')){ - if(row[k]=='.'){ - genotier.exomiserMutationTaster = '' - } else { - genotier.exomiserMutationTaster = row[k] - } - } else if(value.includes('SIFT(<0.06)')){ - if(row[k]=='.'){ - genotier.exomiserSift = '' - } else { - genotier.exomiserSift = row[k] - } - } else if(value.includes('REMM')){ - if(row[k]=='.'){ - genotier.exomiserRemm = '' - } else { - genotier.exomiserRemm = row[k] - } - } else if(value.includes('DBSNP_ID')){ - if(row[k]=='.'){ - genotier.exomiserDbsnp = '' - } else { - genotier.exomiserDbsnp = row[k] - } - } else if(value.includes('MAX_FREQUENCY')){ - if(row[k]=='.'){ - genotier.exomiserMaxFreq = '' - } else { - genotier.exomiserMaxFreq = row[k] - } - } else if(value.includes('DBSNP_FREQUENCY')){ - if(row[k]=='.'){ - genotier.exomiserDbsnpFreq = '' - } else { - genotier.exomiserDbsnpFreq = row[k] - } - } else if(value.includes('EVS_EA_FREQUENCY')){ - if(row[k]=='.'){ - genotier.exomiserEvsEaFreq = '' - } else { - genotier.exomiserEvsEaFreq = row[k] - } - } else if(value.includes('EVS_AA_FREQUENCY')){ - if(row[k]=='.'){ - genotier.exomiserEvsAaFreq = '' - } else { - genotier.exomiserEvsAaFreq = row[k] - } - } else if(value.includes('EXAC_AFR_FREQ')){ - if(row[k]=='.'){ - genotier.exomiserExacAfrFreq = '' - } else { - genotier.exomiserExacAfrFreq = row[k] - } - } else if(value.includes('EXAC_AMR_FREQ')){ - if(row[k]=='.'){ - genotier.exomiserExacAmrFreq = '' - } 
else { - genotier.exomiserExacAmrFreq = row[k] - } - } else if(value.includes('EXAC_EAS_FREQ')){ - if(row[k]=='.'){ - genotier.exomiserExacEasFreq = '' - } else { - genotier.exomiserExacEasFreq = row[k] - } - } else if(value.includes('EXAC_FIN_FREQ')){ - if(row[k]=='.'){ - genotier.exomiserExacFinFreq = '' - } else { - genotier.exomiserExacFinFreq = row[k] - } - } else if(value.includes('EXAC_NFE_FREQ')){ - if(row[k]=='.'){ - genotier.exomiserExacNfeFreq = '' - } else { - genotier.exomiserExacNfeFreq = row[k] - } - } else if(value.includes('EXAC_SAS_FREQ')){ - if(row[k]=='.'){ - genotier.exomiserExacSasFreq = '' - } else { - genotier.exomiserExacSasFreq = row[k] - } - } else if(value.includes('EXAC_OTH_FREQ')){ - if(row[k]=='.'){ - genotier.exomiserExacOthFreq = '' - } else { - genotier.exomiserExacOthFreq = row[k] - } - } else if(value.includes('CONTRIBUTING_VARIANT')){ - if(row[k]=='.'){ - genotier.exomiserContributingVariant = '' - } else { - genotier.exomiserContributingVariant = row[k] - } - } - } - // save modified field - genotier.save((err,genotier) =>{ - if(err) { - console.log(err); - count = count+1; - if(count == tsvFileContentSplit.length-1){ - closeConnection() - } - } else { - fs.appendFileSync(logFileName, 'Genotier for location: '+genotier.fullLocation+' had been changed \n') - count = count+1; - if(count == tsvFileContentSplit.length-1){ - closeConnection() - } - } - }) - } - } - }) - } else { - console.log('No location found') - } - } -}) \ No newline at end of file diff --git a/bin/application.properties b/bin/application.properties index 30f0af7..3b2e7fb 100644 --- a/bin/application.properties +++ b/bin/application.properties @@ -21,7 +21,7 @@ #root path where data is to be downloaded and worked on #it is assumed that all the files required by exomiser listed in this properties file #will be found in the data directory unless specifically overridden here. 
-exomiser.data-directory=/data/exomiser-data-bundle/ +exomiser.data-directory=/data/exomiser-data-bundle/exomiser-data-bundle/ ### hg19 assembly ### # exomiser.hg19.data-version=2102 @@ -49,7 +49,7 @@ exomiser.hg38.data-version=2102 #transcript source will default to ensembl. Can define as ucsc/ensembl/refseq exomiser.hg38.transcript-source=ensembl -exomiser.hg38.data-directory=/data/exomiser-data-bundle/2102_hg38 +exomiser.hg38.data-directory=/data/exomiser-data-bundle/exomiser-data-bundle/2102_hg38 #location of CADD/REMM Tabix files - you will need these for analysis of non-coding variants. #CADD can be downloaded from http://cadd.gs.washington.edu/download - v1.3 has been tested. #REMM can be downloaded from https://charite.github.io/software-remm-score.html @@ -57,17 +57,17 @@ exomiser.hg38.data-directory=/data/exomiser-data-bundle/2102_hg38 # #You will require the tsv.gz and tsv.gz.tbi (tabix) file pairs. #Un-comment and add the full path to the relevant tsv.gz files if you want to enable these. 
-exomiser.hg38.cadd-snv-path=/data/exomiser-data-bundle/cadd_snvs/whole_genome_SNVs.tsv.gz -exomiser.hg38.cadd-in-del-path=/data/exomiser-data-bundle/2102_hg38/gnomad.genomes.r3.0.indel.tsv.gz +exomiser.hg38.cadd-snv-path=/data/exomiser-data-bundle/exomiser-data-bundle/cadd_snvs/whole_genome_SNVs.tsv.gz +exomiser.hg38.cadd-in-del-path=/data/exomiser-data-bundle/exomiser-data-bundle/2102_hg38/gnomad.genomes.r3.0.indel.tsv.gz #exomiser.hg38.remm-path=${exomiser.hg38.data-directory}/ReMM.v0.3.1.tsv.gz #exomiser.hg38.local-frequency-path=${exomiser.hg38.data-directory}/local_frequency_test.tsv.gz -exomiser.hg38.variant-white-list-path=/data/exomiser-data-bundle/2102_hg38/2102_hg38_clinvar_whitelist.tsv.gz +exomiser.hg38.variant-white-list-path=/data/exomiser-data-bundle/exomiser-data-bundle/2102_hg38/2102_hg38_clinvar_whitelist.tsv.gz ### phenotypes ### exomiser.phenotype.data-version=2102 -exomiser.phenotype.data-directory=/data/exomiser-data-bundle/2102_phenotype +exomiser.phenotype.data-directory=/data/exomiser-data-bundle/exomiser-data-bundle/2102_phenotype #String random walk data file -exomiser.phenotype.random-walk-file-name=/data/exomiser-data-bundle/2102_phenotype/rw_string_10.mv +exomiser.phenotype.random-walk-file-name=/data/exomiser-data-bundle/exomiser-data-bundle/2102_phenotype/rw_string_10.mv #exomiser.phenotype.random-walk-index-file-name=rw_string_9_05_id2index.gz ### caching ### diff --git a/bin/auto_config.yml b/bin/auto_config.yml index 4de86dc..9e72def 100644 --- a/bin/auto_config.yml +++ b/bin/auto_config.yml @@ -6,8 +6,8 @@ analysis: # hg19 or hg38 - ensure that the application has been configured to run the specified assembly otherwise it will halt. 
genomeAssembly: hg38 vcf: vcf_placeholder - ped: - proband: + ped: ped_placeholder + proband: proband_placeholder hpoIds: [hpo_ids_placeholder] # These are the default settings, with values representing the maximum minor allele frequency in percent (%) permitted for an # allele to be considered as a causative candidate under that mode of inheritance. @@ -80,20 +80,35 @@ analysis: #intervalFilter: {intervals: ['chr10:123256200-123256300', 'chr10:123256290-123256350']}, # or using a BED file - NOTE this should be 0-based, Exomiser otherwise uses 1-based coordinates in line with VCF #intervalFilter: {bed: /full/path/to/bed_file.bed}, - #failedVariantFilter: {}, + failedVariantFilter: {}, #genePanelFilter: {geneSymbols: ['FGFR1','FGFR2']}, ##################################################################################### - hiPhivePrioritiser: {}, + #hiPhivePrioritiser: {}, #running the prioritiser followed by a priorityScoreFilter will remove genes #which are least likely to contribute to the phenotype defined in hpoIds, this will #dramatically reduce the time and memory required to analyse a genome. # 0.501 is a good compromise to select good phenotype matches and the best protein-protein interactions hits from hiPhive - priorityScoreFilter: {priorityType: HIPHIVE_PRIORITY, minPriorityScore: 0.501}, + #priorityScoreFilter: {priorityType: HIPHIVE_PRIORITY, minPriorityScore: 0.501}, ###################################################################################### #variantEffectFilter: {remove: [SYNONYMOUS_VARIANT]}, #regulatoryFeatureFilter removes all non-regulatory non-coding variants over 20Kb from a known gene. 
- regulatoryFeatureFilter: {}, + #regulatoryFeatureFilter: {}, #knownVariantFilter: {}, #removes variants represented in the database + variantEffectFilter: { + remove: [ + FIVE_PRIME_UTR_EXON_VARIANT, + FIVE_PRIME_UTR_INTRON_VARIANT, + THREE_PRIME_UTR_EXON_VARIANT, + THREE_PRIME_UTR_INTRON_VARIANT, + NON_CODING_TRANSCRIPT_EXON_VARIANT, + NON_CODING_TRANSCRIPT_INTRON_VARIANT, + CODING_TRANSCRIPT_INTRON_VARIANT, + UPSTREAM_GENE_VARIANT, + DOWNSTREAM_GENE_VARIANT, + INTERGENIC_VARIANT, + REGULATORY_REGION_VARIANT + ] + }, frequencyFilter: {maxFrequency: 2.0}, pathogenicityFilter: {keepNonPathogenic: keep_non_pathogenic_placeholder}, #inheritanceFilter and omimPrioritiser should always run AFTER all other filters have completed @@ -103,21 +118,21 @@ analysis: omimPrioritiser: {}, #Other prioritisers: Only combine omimPrioritiser with one of these. #Don't include any if you only want to filter the variants. - #hiPhivePrioritiser: {}, + hiPhivePrioritiser: {} # or run hiPhive in benchmarking mode: #hiPhivePrioritiser: {diseaseId: 'OMIM:101600', candidateGeneSymbol: FGFR2, runParams: 'human,mouse,fish,ppi'}, #phenixPrioritiser: {} #exomeWalkerPrioritiser: {seedGeneIds: [11111, 22222, 33333]} - prioritiser_placeholder : {} + #prioritiser_placeholder : {} ] outputOptions: outputContributingVariantsOnly: false - #numGenes options: 0 = all or specify a limit e.g. 500 for the first 500 results + #numGenes options: 0 = all or specify a limit e.g. 500 for the first 500 results numGenes: 0 #outputPrefix options: specify the path/filename without an extension and this will be added - # according to the outputFormats option. If unspecified this will default to the following: + # according to the outputFormats option. If unspecified this will default to the following: # {exomiserDir}/results/input-vcf-name-exomiser-results.html - # alternatively, specify a fully qualifed path only. e.g. /users/jules/exomes/analysis + # alternatively, specify a fully qualifed path only. e.g. 
/users/jules/exomes/analysis outputPrefix: output_prefix_placeholder #out-format options: HTML, JSON, TSV_GENE, TSV_VARIANT, VCF (default: HTML) - outputFormats: [HTML, JSON, TSV_GENE, TSV_VARIANT, VCF] \ No newline at end of file + outputFormats: [HTML, JSON, TSV_GENE, TSV_VARIANT, VCF] diff --git a/bin/auto_config_V2.yml b/bin/auto_config_V2.yml new file mode 100644 index 0000000..9e72def --- /dev/null +++ b/bin/auto_config_V2.yml @@ -0,0 +1,138 @@ +## Exomiser Analysis Template. +# These are all the possible options for running exomiser. Use this as a template for +# your own set-up. +--- +analysis: + # hg19 or hg38 - ensure that the application has been configured to run the specified assembly otherwise it will halt. + genomeAssembly: hg38 + vcf: vcf_placeholder + ped: ped_placeholder + proband: proband_placeholder + hpoIds: [hpo_ids_placeholder] + # These are the default settings, with values representing the maximum minor allele frequency in percent (%) permitted for an + # allele to be considered as a causative candidate under that mode of inheritance. + # If you just want to analyse a sample under a single inheritance mode, delete/comment-out the others. For AUTOSOMAL_RECESSIVE + # or X_RECESSIVE ensure *both* relevant HOM_ALT and COMP_HET modes are present. + # In cases where you do not want any cut-offs applied an empty map should be used e.g. 
inheritanceModes: {} + inheritanceModes: { + AUTOSOMAL_DOMINANT: 0.1, + AUTOSOMAL_RECESSIVE_HOM_ALT: 0.1, + AUTOSOMAL_RECESSIVE_COMP_HET: 2.0, + X_DOMINANT: 0.1, + X_RECESSIVE_HOM_ALT: 0.1, + X_RECESSIVE_COMP_HET: 2.0, + MITOCHONDRIAL: 0.2 + } + #FULL or PASS_ONLY + analysisMode: PASS_ONLY + #Possible frequencySources: + #Thousand Genomes project - http://www.1000genomes.org/ (THOUSAND_GENOMES) + #TOPMed - https://www.nhlbi.nih.gov/science/precision-medicine-activities (TOPMED) + #UK10K - http://www.uk10k.org/ (UK10K) + #ESP project - http://evs.gs.washington.edu/EVS/ (ESP_) + # ESP_AFRICAN_AMERICAN, ESP_EUROPEAN_AMERICAN, ESP_ALL, + #ExAC project http://exac.broadinstitute.org/about (EXAC_) + # EXAC_AFRICAN_INC_AFRICAN_AMERICAN, EXAC_AMERICAN, + # EXAC_SOUTH_ASIAN, EXAC_EAST_ASIAN, + # EXAC_FINNISH, EXAC_NON_FINNISH_EUROPEAN, + # EXAC_OTHER + #gnomAD - http://gnomad.broadinstitute.org/ (GNOMAD_E, GNOMAD_G) + frequencySources: [ + THOUSAND_GENOMES, + TOPMED, + UK10K, + + ESP_AFRICAN_AMERICAN, ESP_EUROPEAN_AMERICAN, ESP_ALL, + + EXAC_AFRICAN_INC_AFRICAN_AMERICAN, EXAC_AMERICAN, + EXAC_SOUTH_ASIAN, EXAC_EAST_ASIAN, + EXAC_FINNISH, EXAC_NON_FINNISH_EUROPEAN, + EXAC_OTHER, + + GNOMAD_E_AFR, + GNOMAD_E_AMR, +# GNOMAD_E_ASJ, + GNOMAD_E_EAS, + GNOMAD_E_FIN, + GNOMAD_E_NFE, + GNOMAD_E_OTH, + GNOMAD_E_SAS, + + GNOMAD_G_AFR, + GNOMAD_G_AMR, +# GNOMAD_G_ASJ, + GNOMAD_G_EAS, + GNOMAD_G_FIN, + GNOMAD_G_NFE, + GNOMAD_G_OTH, + GNOMAD_G_SAS + ] + #Possible pathogenicitySources: POLYPHEN, MUTATION_TASTER, SIFT, CADD, REMM + #REMM is trained on non-coding regulatory regions + #*WARNING* if you enable CADD or REMM ensure that you have downloaded and installed the CADD/REMM tabix files + #and updated their location in the application.properties. Exomiser will not run without this. + pathogenicitySources: [pathogenicity_sources_placeholder] + #this is the recommended order for a genome-sized analysis. 
+ #all steps are optional + steps: [ + #intervalFilter: {interval: 'chr10:123256200-123256300'}, + # or for multiple intervals: + #intervalFilter: {intervals: ['chr10:123256200-123256300', 'chr10:123256290-123256350']}, + # or using a BED file - NOTE this should be 0-based, Exomiser otherwise uses 1-based coordinates in line with VCF + #intervalFilter: {bed: /full/path/to/bed_file.bed}, + failedVariantFilter: {}, + #genePanelFilter: {geneSymbols: ['FGFR1','FGFR2']}, + ##################################################################################### + #hiPhivePrioritiser: {}, + #running the prioritiser followed by a priorityScoreFilter will remove genes + #which are least likely to contribute to the phenotype defined in hpoIds, this will + #dramatically reduce the time and memory required to analyse a genome. + # 0.501 is a good compromise to select good phenotype matches and the best protein-protein interactions hits from hiPhive + #priorityScoreFilter: {priorityType: HIPHIVE_PRIORITY, minPriorityScore: 0.501}, + ###################################################################################### + #variantEffectFilter: {remove: [SYNONYMOUS_VARIANT]}, + #regulatoryFeatureFilter removes all non-regulatory non-coding variants over 20Kb from a known gene. 
+ #regulatoryFeatureFilter: {}, + #knownVariantFilter: {}, #removes variants represented in the database + variantEffectFilter: { + remove: [ + FIVE_PRIME_UTR_EXON_VARIANT, + FIVE_PRIME_UTR_INTRON_VARIANT, + THREE_PRIME_UTR_EXON_VARIANT, + THREE_PRIME_UTR_INTRON_VARIANT, + NON_CODING_TRANSCRIPT_EXON_VARIANT, + NON_CODING_TRANSCRIPT_INTRON_VARIANT, + CODING_TRANSCRIPT_INTRON_VARIANT, + UPSTREAM_GENE_VARIANT, + DOWNSTREAM_GENE_VARIANT, + INTERGENIC_VARIANT, + REGULATORY_REGION_VARIANT + ] + }, + frequencyFilter: {maxFrequency: 2.0}, + pathogenicityFilter: {keepNonPathogenic: keep_non_pathogenic_placeholder}, + #inheritanceFilter and omimPrioritiser should always run AFTER all other filters have completed + #they will analyse genes according to the specified modeOfInheritance above- UNDEFINED will not be analysed. + inheritanceFilter: {}, + #omimPrioritiser isn't mandatory. + omimPrioritiser: {}, + #Other prioritisers: Only combine omimPrioritiser with one of these. + #Don't include any if you only want to filter the variants. + hiPhivePrioritiser: {} + # or run hiPhive in benchmarking mode: + #hiPhivePrioritiser: {diseaseId: 'OMIM:101600', candidateGeneSymbol: FGFR2, runParams: 'human,mouse,fish,ppi'}, + #phenixPrioritiser: {} + #exomeWalkerPrioritiser: {seedGeneIds: [11111, 22222, 33333]} + #prioritiser_placeholder : {} + ] +outputOptions: + outputContributingVariantsOnly: false + #numGenes options: 0 = all or specify a limit e.g. 500 for the first 500 results + numGenes: 0 + #outputPrefix options: specify the path/filename without an extension and this will be added + # according to the outputFormats option. If unspecified this will default to the following: + # {exomiserDir}/results/input-vcf-name-exomiser-results.html + # alternatively, specify a fully qualifed path only. e.g. 
/users/jules/exomes/analysis + outputPrefix: output_prefix_placeholder + #out-format options: HTML, JSON, TSV_GENE, TSV_VARIANT, VCF (default: HTML) + outputFormats: [HTML, JSON, TSV_GENE, TSV_VARIANT, VCF] diff --git a/bin/genotiers.js b/bin/genotiers.js deleted file mode 100644 index d383a52..0000000 --- a/bin/genotiers.js +++ /dev/null @@ -1,213 +0,0 @@ -const mongoose = require('mongoose'); - -const genotiersSchema = new mongoose.Schema({ - acmgAppxScore: { - type: Number - }, - acmgBenignSubscore: { - type: String - }, - acmgCodingImpact: { - type: String - }, - acmgGeneId: { - type: Number - }, - acmgPathogenicSubscore: { - type: String - }, - acmgUserExplain: { - type: Array - }, - acmgVerdict: { - type: String - }, - acmgVersion: { - type: String - }, - allelicBalance: { - type: Number - }, - alternative: { - type: String - }, - all1: { - type: String - }, - all2: { - type: String - }, - ampApproxScore: { - type: String - }, - ampClassifications: { - type: String - }, - ampClassificationsTier: { - type: String - }, - ampName: { - type: String - }, - ampTier1: { - type: String - }, - ampTier2: { - type: String - }, - ampTier3: { - type: String - }, - ampTier4: { - type: String - }, - ampVerdict: { - type: String - }, - ampVerdictTier: { - type: String - }, - ampVersion: { - type: String - }, - cbio: { - type: String - }, - chromosome: { - type: String, - required: true - }, - coverage: { - type: Number - }, - fullLocation: { - type: String, - required: true - }, - gene: { - type: String - }, - genotype: { - type: String - }, - i: { - type: String, - required: true - }, - location: { - type: String, - required: true - }, - notes: { - type: String - }, - position: { - type: Number, - required: true - }, - reference: { - type: String - }, - barcode: { - type: String - }, - variantType: { - type: String - }, - vcfSampleId: { - type: String - }, - zygosity: { - type: String - }, - exomiserQual: { - type: String - }, - exomiserFilter: { - type: String - }, - 
exomiserGenotype: { - type: String - }, - exomiserHgvs: { - type: String - }, - exomiserCoverage: { - type: String - }, - exomiserFunctionalClass: { - type: String - }, - exomiserVariantScore: { - type: String - }, - exomiserGene: { - type: String - }, - exomiserGenePhenoScore: { - type: String - }, - exomiserGeneVariantScore: { - type: String - }, - exomiserGeneCombinedScore: { - type: String - }, - exomiserCadd: { - type: String - }, - exomiserPolyphen: { - type: String - }, - exomiserMutationTaster: { - type: String - }, - exomiserSift: { - type: String - }, - exomiserRemm: { - type: String - }, - exomiserDbsnp: { - type: String - }, - exomiserMaxFreq: { - type: String - }, - exomiserDbsnpFreq: { - type: String - }, - exomiserEvsEaFreq: { - type: String - }, - exomiserEvsAaFreq: { - type: String - }, - exomiserExacAfrFreq: { - type: String - }, - exomiserExacAmrFreq: { - type: String - }, - exomiserExacEasFreq: { - type: String - }, - exomiserExacFinFreq: { - type: String - }, - exomiserExacNfeFreq: { - type: String - }, - exomiserExacSasFreq: { - type: String - }, - exomiserExacOthFreq: { - type: String - }, - exomiserContributingVariant: { - type: String - } -}) - -genotiersSchema.index({fullLocation: 1, i: 1 }, { unique: true }) -mongoose.model('Genotiers', genotiersSchema, 'genotiers'); \ No newline at end of file diff --git a/bin/get_hpo_terms_from_barcode.js b/bin/get_hpo_terms_from_barcode.js deleted file mode 100644 index 38bcb37..0000000 --- a/bin/get_hpo_terms_from_barcode.js +++ /dev/null @@ -1,119 +0,0 @@ -// GET HPO TERMS FROM SAMPLE ID is a script -// which looks for participant ID (more specifically the field `i`) in first collection specified and -// then uses found id ('i') to find HP terms in second collection specified - -// To run: -// minimum: -// node --no-warnings ./bin/get_hpo_terms_from_barcode.js --barcode 000000001 -// full set: -// node ./bin/get_hpo_terms_from_barcode.js --barcode '000000001' --dbWithPatientSamples clinical-portal 
--uriWithPatientSamples mongodb://localhost:27017 --collectionPatientSamples patientsamples --dbWithParticipants cohort-browser --collectionParticipants participants --uriWithParticipants mongodb://localhost:27017 - -// required packages -const { program } = require('commander'); -const fs = require('fs') - -// Take inputs from command line -program - .description('A script that, given a barcode, retrieves the corresponding participant ID (more specifically `i`) and HPO terms') - .option('-b, --barcode ', '547300000450') - .option('-ds, --dbWithPatientSamples ', 'Database containing the collection called "patientsamples"', "clinical-portal") - .option('-us, --uriWithPatientSamples ', 'Uri to database containing the collection called "patientsamples"', "mongodb://localhost:27017") - .option('-cs, --collectionPatientSamples ', 'Collection name for samples', "patientsamples") - .option('-dp, --dbWithParticipants ', 'Database containing the collection called "participants"', "cohort-browser") - .option('-up, --uriWithParticipants ', 'Uri to database containing the collection called "participants"', "mongodb://localhost:27017") - .option('-cp, --collectionParticipants ', 'Collection name for participants', "participants") -program.parse(); - -var barcode = program.opts().barcode - -var dbNameSamples = program.opts().dbWithPatientSamples -var uriSamples = program.opts().uriWithPatientSamples -var collectionSamples = program.opts().collectionPatientSamples - -var dbNameParticipants = program.opts().dbWithParticipants -var uriParticipants = program.opts().uriWithParticipants -var collectionParticipants = program.opts().collectionParticipants - -// initiate variables -var fileName = barcode+'_hpo_list.txt' - -// initialise output file -fs.writeFileSync(fileName, ''); -fs.appendFileSync(fileName, '# '+barcode+' \n') - -// connect to mongoDB -var MongoClient = require('mongodb',{ useUnifiedTopology: true }).MongoClient; - -// use promise to find first query results -var 
promise = new Promise((resolve, reject) => { - MongoClient.connect(uriSamples,{poolSize: 1000}, function(err, db) { - if (err) throw err; - console.log('connected to database 1') - var database = db.db(dbNameSamples); - // prepare first query - var query = { 'barcode': barcode }; - // query database - database.collection(collectionSamples).find(query).toArray(function(err, results) { - if(err) { - reject(err) - db.close() - } else if(!results){ - reject('No results') - db.close(); - } else if(Array.isArray(results)){ - if(results.length==0){ - reject('No results') - db.close() - } else { - // when results are created resolve promise - resolve(results[0].i) - db.close() - } - } - }); - }) - }); -// use results from first promise to query again -promise.then(firstQueryResults =>{ - MongoClient.connect(uriParticipants,{poolSize: 1000}, (err, db2) =>{ - if (err) throw err; - console.log('connected to database 2') - var database2 = db2.db(dbNameParticipants); - // create new query - var newQuery = {'i': firstQueryResults} - // query database - database2.collection(collectionParticipants).find(newQuery).toArray((err, results2)=> { - if(err){ - console.log(err) - db2.close() - } else if(!results2){ - console.log('No results') - db2.close() - } else { - if(results2.length == 0){ - console.log('No results') - db2.close() - } else { - results2.forEach(instance=>{ - for (var key in instance) { - if (instance.hasOwnProperty(key)) { - // check if values have 'HP' in value - if(JSON.stringify(instance[key]).includes('HP')){ - var hpTerm = JSON.stringify(instance[key]) - if((JSON.stringify(instance[key]).includes('(')) && (JSON.stringify(instance[key]).includes(')'))){ - hpTerm = hpTerm.split('(')[1].split(')')[0] - } - // save results in output file - fs.appendFileSync(fileName, hpTerm.replace(/['"]+/g, '')+'\n') - } - } - } - }) - db2.close() - } - } - }) - }) -}).catch(err=>{ - console.log(err) -}) diff --git a/bin/ped_module.py b/bin/ped_module.py new file mode 100755 index 
0000000..5d3de65 --- /dev/null +++ b/bin/ped_module.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +# RPC 291122 +# Aim: take family file and convert to PED +from ped_parser import Individual, Family +import pandas as pd +import os +import argparse + +# local test +# os.chdir("/Users/ryancardenas/Documents/exomiser-pipeline-nf/bin") +# input_dat = pd.read_csv("familytest.tsv", sep="\t") + +#build arg parser here +parser = argparse.ArgumentParser(description='Create PED file from family file - Exomiser') +parser.add_argument('--input_family', nargs=1, required=True, help='Enter the path for the family TSV file') +args = parser.parse_args() + + +#bamfile set +input = str(args.input_family[0]) +input_dat = pd.read_csv(input, sep="\t") + +# --------------- create function for ped_parser +def PED_function(run_ID, proband_ID, vcf_path, vcf_index, proband_sex, mother_ID, father_ID): + # Extract + output_name = (f"{proband_ID}_tmp.ped") + outfile = open(output_name,'a') + my_individuals = [] + print(f"{output_name}") + # extract filename without extension or path + file_name = os.path.basename(f"{vcf_path}") + file_base = os.path.splitext(file_name)[0] + + proband_sex = proband_sex.lower() + + if proband_sex == 'm' or proband_sex == 'male': + proband_sex2 = "1" + elif proband_sex == 'f' or proband_sex == 'female': + proband_sex2 = "2" + elif proband_sex == 'other': + proband_sex2 = "0" + else: + proband_sex2 = proband_sex + + #proband_ID + my_individuals.append(Individual( + f'{proband_ID}', + family=f'{run_ID}', + mother=f'{mother_ID}', + father=f'{father_ID}', + sex=f'{proband_sex2}', + phenotype='2' + )) + #mother + my_individuals.append(Individual( + f'{mother_ID}', + family=f'{run_ID}', + mother='0', + father='0', + sex='2', + phenotype='1' + )) + #father + my_individuals.append(Individual( + f'{father_ID}', + family=f'{run_ID}', + mother='0', + father='0', + sex='1', + phenotype='1' + )) + my_family = Family(family_id=f'{run_ID}') + for individual in 
my_individuals: + my_family.add_individual(individual) + + # save PED files + my_family.to_ped(outfile) + + + +for index, row in input_dat.iterrows(): + + # define variables + run_id1 = row["run_id"] + proband_id1 = row["proband_id"] + hpo1 = row["hpo"] + mother_id1 = row["mother_id"] + father_id1 = row["father_id"] + vcf_path1 = row["vcf_path"] + vcf_index_path1 = row["vcf_index_path"] + proband_sex1 = row["proband_sex"] + + PED_function(run_id1,proband_id1, vcf_path1, vcf_index_path1, proband_sex1, mother_id1, father_id1) + + # create HPO file here. + os.system(f"rm -fr {proband_id1}-HPO.txt" ) + os.system(f"echo '{hpo1}' > {proband_id1}-HPO.txt") + + #create proband_id into text_file + os.system(f"echo '{proband_id1}' > {proband_id1}_ID.txt") + + # filter PEDs to only have proband_id + # Strangely despite loops the file appends each family is added + cmd_strip = f"grep -A 2 '{proband_id1}' {proband_id1}_tmp.ped > {proband_id1}.ped" + os.system(cmd_strip) diff --git a/bin/ped_module_man.py b/bin/ped_module_man.py new file mode 100755 index 0000000..e2a5d7c --- /dev/null +++ b/bin/ped_module_man.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 +# RPC 291122 +# Aim: take family file and convert to PED +import pandas as pd +import os +import argparse +from pathlib import Path + +# local test +# os.chdir("/Users/ryancardenas/Documents/exomiser-pipeline-nf/bin") +# input_dat = pd.read_csv("familytest.tsv", sep="\t") + +#build arg parser here +parser = argparse.ArgumentParser(description='Create PED file from family file - Exomiser') +parser.add_argument('--input_family', nargs=1, required=True, help='Enter the path for the family TSV file') +args = parser.parse_args() + + +#bamfile set +input = str(args.input_family[0]) +input_dat = pd.read_csv(input, sep="\t", skipinitialspace = True) + + +def PED_function(run_ID, proband_ID, vcf_path, vcf_index, proband_sex, mother_ID, father_ID): + # Extract + output_name = (f"{proband_ID}.ped") + print(f"creating {proband_ID}.ped") + 
+ text_file = open(f"{output_name}", "w") + + # extract filename without extention or path + file_base = Path(f"{vcf_path}").stem + file_base = Path(f"{file_base}").stem + + if proband_sex == 'M' or proband_sex == 'Male': + proband_sex2 = "1" + elif proband_sex == 'F' or proband_sex == 'Female': + proband_sex2 = "2" + else: + proband_sex2 = proband_sex + + template = f"""#FamilyID\tIndividualID\tPaternalID\tMaternalID\tSex\tPhenotype +{run_ID}\t{proband_ID}\t{father_ID}\t{mother_ID}\t{proband_sex2}\t2 +{run_ID}\t{mother_ID}\t0\t0\t2\t1 +{run_ID}\t{father_ID}\t0\t0\t1\t1 + """ + print(template) + #save PED using bash + n = text_file.write(template) + text_file.close() + print(f"finished {proband_ID}.ped") + +for index, row in input_dat.iterrows(): + + # define variables + run_id1 = row["run_id"] + proband_id1 = row["proband_id"] + hpo1 = row["hpo"] + mother_id1 = row["mother_id"] + father_id1 = row["father_id"] + vcf_path1 = row["vcf_path"] + vcf_index_path1 = row["vcf_index_path"] + proband_sex1 = row["proband_sex"] + + PED_function(run_id1,proband_id1, vcf_path1, vcf_index_path1, proband_sex1, mother_id1, father_id1) + + # create HPO file here. 
+ os.system(f"rm -fr {proband_id1}-HPO.txt" ) + os.system(f"echo '{hpo1}' > {proband_id1}-HPO.txt") diff --git a/conf/family_test.config b/conf/family_test.config new file mode 100644 index 0000000..5f2e5af --- /dev/null +++ b/conf/family_test.config @@ -0,0 +1,7 @@ +params { + families_file = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/fam_file.tsv' + prioritisers = 'hiPhivePrioritiser' + exomiser_data = "s3://lifebit-featured-datasets/pipelines/exomiser-data-bundle" + application_properties = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/application.properties' + auto_config_yml = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/auto_config.yml' +} diff --git a/conf/multi_hpo_test.config b/conf/multi_hpo_test.config new file mode 100644 index 0000000..38e67c3 --- /dev/null +++ b/conf/multi_hpo_test.config @@ -0,0 +1,7 @@ +params { + families_file = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/fam_file_multi_hpo.tsv' + prioritisers = 'hiPhivePrioritiser' + exomiser_data = "s3://lifebit-featured-datasets/pipelines/exomiser-data-bundle" + application_properties = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/application.properties' + auto_config_yml = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/auto_config.yml' +} diff --git a/conf/single_vcf_test.config b/conf/single_vcf_test.config new file mode 100644 index 0000000..19eff49 --- /dev/null +++ b/conf/single_vcf_test.config @@ -0,0 +1,8 @@ +params { + families_file = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/single_vcf.tsv' + hpo_terms_file = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/hpo_terms_file.txt' + prioritisers = 'hiPhivePrioritiser' + exomiser_data = "s3://lifebit-featured-datasets/pipelines/exomiser-data-bundle" + application_properties = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/application.properties' + auto_config_yml = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/auto_config.yml' +} diff --git 
a/containers/ped_parser/Dockerfile b/containers/ped_parser/Dockerfile new file mode 100644 index 0000000..201a3a3 --- /dev/null +++ b/containers/ped_parser/Dockerfile @@ -0,0 +1,23 @@ +FROM continuumio/miniconda3@sha256:77f9119def83d94b7afb654b39a1c21aaa7f255518aba57de08321760c27c86a + +ENV VERSION="1.6.6-py_2" + +ARG ENV_NAME="ped-parser" + +LABEL description="Docker containing the ped_parser python package." \ + software.version="${VERSION}" \ + maintainer="Leila Mansouri: leila.mansouri@lifebit.ai" \ + name="quay.io/lifebitaiorg/ped_parser:1.6.6-py_2" + +#needed as per the documentation +RUN apt-get update -y &&\ + apt-get install -y procps \ + zlib1g &&\ + rm -rf /var/lib/apt/lists/* + +#installing the tool and its dependencies +RUN pip install ped_parser + +RUN pip install pandas + +ENTRYPOINT [ ] diff --git a/main.nf b/main.nf index 2f8fa43..a64bb00 100644 --- a/main.nf +++ b/main.nf @@ -1,7 +1,7 @@ #!/usr/bin/env nextflow - import groovy.json.* + /* ======================================================================================== lifebit-ai/exomiser-nf @@ -16,13 +16,13 @@ c_white = "\033[0;37m"; c_yellow = "\033[0;33m"; c_purple = "\033[0;35m"; -sample_name = params.sample_name // Header log info log.info "-${c_purple}\nPARAMETERS SUMMARY${c_reset}-" log.info "-${c_teal}config:${c_reset}- ${params.config}" -log.info "-${c_teal}input:${c_reset}- ${params.input}" -log.info "-${c_teal}sample_name:${c_reset}- ${sample_name}" -log.info "-${c_teal}filename_hpo:${c_reset}- ${params.filename_hpo}" +log.info "-${c_teal}filename_design_file:${c_reset}- ${params.families_file}" +if(params.hpo_file) log.info "-${c_teal}filename_hpo:${c_reset}- ${params.filename_hpo}" +if(params.ped_file) log.info "-${c_teal}filename_ped:${c_reset}- ${params.ped_file}" +if(params.families_file) log.info "-${c_teal}families_file:${c_reset}- ${params.families_file}" log.info "-${c_teal}analysis_mode:${c_reset}- ${params.analysis_mode}" log.info "-${c_teal}exomiser_data:${c_reset}- 
${params.exomiser_data}" log.info "-${c_teal}exomiser_phenotype_data:${c_reset}- ${params.exomiser_phenotype_data}" @@ -34,7 +34,6 @@ log.info "-${c_teal}min_priority_score:${c_reset}- ${params.min_priority_score}" log.info "-${c_teal}application_properties:${c_reset}- ${params.application_properties}" log.info "-${c_teal}auto_config_yml:${c_reset}- ${params.auto_config_yml}" log.info "-${c_teal}exomiser_data_directory:${c_reset}- ${params.exomiser_data_directory}" -log.info "-${c_teal}hpo terms from a file:${c_reset}- ${params.hpo_terms_file}" log.info "-${c_teal}exomiser_container_tag:${c_reset}- ${params.exomiser_container_tag}" log.info "-${c_teal}debug_script:${c_reset}- ${params.debug_script}" log.info "-${c_teal}echo:${c_reset}- ${params.echo}" @@ -46,23 +45,29 @@ log.info "" // Check input parameters // ---------------------------------------------------*/ -if(params.input) { - Channel - .fromPath( "${params.input}" ) - .ifEmpty { exit 1, "VCF file: ${params.input} not found"} - .into { ch_vcf ; ch_vcf_inspect; ch_vcf_for_geneyx } +if(params.families_file) { + Channel + .fromPath( "${params.families_file}") + .ifEmpty { exit 1, "Family file: ${params.families_file} not found"} + .set {ch_families_file} } else { - exit 1, "please specify VCF file with --input parameter" + exit 1, "please specify Family file with --families_file parameter" } + + +Channel + .fromPath(params.families_file) + .ifEmpty { exit 1, "Cannot find input file : ${params.families_file}" } + .splitCsv(header:true, sep:'\t', strip: true) + .map {row -> [ row.proband_id, file(row.vcf_path), file(row.vcf_index_path)] } + .into {ch_vcf_paths; ch_vcf_paths2} + // Conditional creation of channels, custom if provided else default from bin/ projectDir = workflow.projectDir ch_application_properties = params.application_properties ? Channel.value(file(params.application_properties)) : Channel.fromPath("${projectDir}/bin/application.properties") -ch_auto_config_yml = params.auto_config_yml ? 
Channel.value(file(params.auto_config_yml)) : Channel.fromPath("${projectDir}/bin/auto_config.yml") +ch_auto_config_yml = params.auto_config_yml ? Channel.value(file(params.auto_config_yml)) : Channel.fromPath("${projectDir}/bin/auto_config.yml") -// Stage scripts from bin -ch_add_exomiser_fields_script = Channel.value(file("${projectDir}/bin/add_exomiser_fields_to_genotiers.js")) -ch_get_hpo_terms_script = Channel.value(file("${projectDir}/bin/get_hpo_terms_from_barcode.js")) // set exomiser specific flags pathogenicitySourcesList= definePathogenicitySources() @@ -78,20 +83,45 @@ println(selected_prioritisers) selected_analysis_mode = params.analysis_mode.split(',').collect{it.trim()} if (!checkParameterList(selected_analysis_mode, analysisModesList)) exit 1, "Unknown analysis mode, the available options are:\n$analysisModesList" - -// Prevent an error in AWSBatch (when running by awsbatch executor) -// by which this file is taken as /home/ubuntu/hpo_terms_file.txt instead of its correct path. 
-hpo_terms_filename = "${projectDir}/${params.hpo_terms_file}" + +ch_exomiser_data = Channel.fromPath("${params.exomiser_data}") +/*-------------------------------------------------- + Create PED and HPO file from design +---------------------------------------------------*/ + +//remove +//ch_vcf_inspect.dump(tag:'ch_vcf') +// if (params.ped_file) ped_ch = Channel.value(file(params.ped_file)) +// if (params.hpo_file) hpo_ch = Channel.value(file(params.hpo_file)) + +// if(!params.ped_file & !params.hpo_file){ + process ped_hpo_creation { + container 'quay.io/lifebitaiorg/ped_parser:1.6.6' + publishDir "${params.outdir}/familyfile/", mode: 'copy' + errorStrategy 'retry' + maxErrors 5 + input: + set proband_id1, file(vcf_path1), file(vcf_index_path1) from ch_vcf_paths + file family_file from ch_families_file.collect() + output: + tuple val(proband_id1), file("${proband_id1}-HPO.txt"), file("${proband_id1}.ped"), file("${proband_id1}_ID.txt") into ch_to_join + script: + """ + ped_module.py --input_family ${family_file} + #to change nan in 0s if there are any + sed -i 's/nan/0/g' ${proband_id1}.ped + #to remove the "parent" line if it's a single sample + sed -i "/0\t0\t0/d" ${proband_id1}.ped + """ + } +// } + +/*-------------------------------------------------- + Run containarised Exomiser +---------------------------------------------------*/ -Channel.fromPath("${params.hpo_terms_file}") - .splitCsv(sep: ',', skip: 1) - .unique() - .map {it -> it.toString().replaceAll("\\[", "").replaceAll("\\]", "")} - .map {it -> "'"+it.trim()+"'"} - .reduce { a, b -> "$a,$b" } - .into { ch_hpo_terms_file ; ch_hpo_terms_file_inspect; ch_hpo_terms } -ch_hpo_terms_file_inspect.dump(tag:'ch_hpo_terms (retrieve_hpo_terms: false)') +ch_combined = ch_vcf_paths2.join(ch_to_join, by: 0).view() /*-------------------------------------------------- Run containarised Exomiser @@ -99,22 +129,19 @@ ch_hpo_terms_file_inspect.dump(tag:'ch_hpo_terms (retrieve_hpo_terms: false)') ch_exomiser_data = 
Channel.fromPath("${params.exomiser_data}") -ch_vcf_inspect.dump(tag:'ch_vcf') + process exomiser { - tag "${vcf}-${prioritiser}" - publishDir "${params.outdir}/${sample_name}", mode: 'copy' + tag "${vcf_path1}" + submitRateLimit = '1 / 5 m' + publishDir "${params.outdir}/${proband_id1}", mode: 'copy' + publishDir "${params.outdir}/", mode: 'copy', pattern: "MultiQC/multiqc_report.html" input: - file(vcf) from ch_vcf - //The following is expected when CADD is omitted, - // WARN: Input tuple does not match input set cardinality declared by process `exomiser` - // ch_all_exomiser_data contents can be 1 or 2 folders, (exomiser_data +/- cadd separately) - // this is fine, as when there is no second dir, a fake input.1 is generated that will be unused - file(application_properties) from ch_application_properties - file(auto_config_yml) from ch_auto_config_yml - file(exomiser_data) from ch_exomiser_data - val(hpo_terms) from ch_hpo_terms + set val(proband_id1),file(vcf_path1),file(vcf_index1), file(hpo_file), file(ped_file),file(id_file) from ch_combined + each file(application_properties) from ch_application_properties + each file(auto_config_yml) from ch_auto_config_yml + each file(exomiser_data) from ch_exomiser_data each prioritiser from selected_prioritisers output: @@ -122,67 +149,55 @@ process exomiser { file("*AR.variants.tsv") optional true file("*yml") optional true file("MultiQC/*.html") optional true - script: final_step = "finished" if (!params.mock_exomiser) { def exomiser_executable = "/exomiser/exomiser-cli-"+"${params.exomiser_version}"+".jar" def exomiser = "java -Xms2g -Xmx4g -jar "+"${exomiser_executable}" """ + #ls -la + #echo "Contents in PED" # link the staged/downloaded data to predefined path mkdir -p /data mkdir -p /data/exomiser-data-bundle - ln -s "\$PWD/$exomiser_data/" /data/exomiser-data-bundle - - ls -l - # Workaround for symlinked files not found - HPO_TERMS="${hpo_terms}" - - # error if no HPO term found - if [[ "\${HPO_TERMS}" == "null" 
]]; then - echo "WARNING: No HPO terms found. So this step of exomiser is skipped, No report will be generated." - echo "Please check HPO terms for the patient in the clinical-portal for whom this sample belongs - ${sample_name}" - # solutions for AWS batch - touch no_hpo_term.html - touch no_hpo_term.vcf - touch no_hpo_term.json - touch no_hpo_term.yml - mkdir -p MultiQC - touch MultiQC/no_hpo_term.html - - else - # Modify auto_config.to pass the params - cp ${auto_config_yml} new_auto_config.yml - - # Swap placeholders with user provided values - sed -i "s/hpo_ids_placeholder/\$HPO_TERMS/g" new_auto_config.yml - sed -i "s/analysis_mode_placeholder/${params.analysis_mode}/g" new_auto_config.yml - sed -i "s/vcf_placeholder/${vcf}/" new_auto_config.yml - sed -i "s/output_prefix_placeholder/sample-${vcf.simpleName}/" new_auto_config.yml - sed -i "s/prioritiser_placeholder/${prioritiser}/" new_auto_config.yml - sed -i "s/min_priority_score_placeholder/${params.min_priority_score}/" new_auto_config.yml - sed -i "s/keep_non_pathogenic_placeholder/${params.keep_non_pathogenic}/" new_auto_config.yml - sed -i "s/pathogenicity_sources_placeholder/${params.pathogenicity_sources}/" new_auto_config.yml - - # Printing (ls, see files; cat, injected values validation) - ${params.debug_script} - cat new_auto_config.yml - - # Run Exomiser - ${exomiser} \ - --analysis new_auto_config.yml \ - --spring.config.location=$application_properties \ - --exomiser.data-directory='.' 
- - # Create the slot for CloudOS html report preview - mkdir MultiQC - cp *.html MultiQC/multiqc_report.html - sed -i "s/Anonymous/${sample_name}/" MultiQC/multiqc_report.html - fi + ln -svf "\$PWD/$exomiser_data/" /data/exomiser-data-bundle + #stat -L $vcf_path1 + #stat -L $vcf_path1 > out.txt + #cat out.txt + proband_id1=`cat ${id_file}` + hpo_band1=`cat ${hpo_file}` + #echo \$proband_id1 + # Modify auto_config.to pass the params + cp ${auto_config_yml} new_auto_config.yml + # Swap placeholders with user provided values + sed -i "s/hpo_ids_placeholder/\$hpo_band1/g" new_auto_config.yml + sed -i "s/analysis_mode_placeholder/${params.analysis_mode}/g" new_auto_config.yml + sed -i "s/vcf_placeholder/${vcf_path1}/g" new_auto_config.yml + sed -i "s/output_prefix_placeholder/sample-${vcf_path1.simpleName}/g" new_auto_config.yml + sed -i "s/prioritiser_placeholder/${prioritiser}/g" new_auto_config.yml + sed -i "s/min_priority_score_placeholder/${params.min_priority_score}/g" new_auto_config.yml + sed -i "s/keep_non_pathogenic_placeholder/${params.keep_non_pathogenic}/g" new_auto_config.yml + sed -i "s/pathogenicity_sources_placeholder/${params.pathogenicity_sources}/g" new_auto_config.yml + sed -i "s/ped_placeholder/${ped_file}/g" new_auto_config.yml + sed -i "s/proband_placeholder/\$proband_id1/g" new_auto_config.yml + # Printing (ls, see files; cat, injected values validation) + #${params.debug_script} + #cat new_auto_config.yml + # Run Exomiser + ${exomiser} \ + --analysis new_auto_config.yml \ + --spring.config.location=$application_properties \ + --exomiser.data-directory='.' 
+ # Create the slot for CloudOS html report preview + mkdir MultiQC + cp *.html MultiQC/multiqc_report.html + + sed -i "s/Anonymous/\$proband_id1/" MultiQC/multiqc_report.html + """ }else{ """ - wget -O ${sample_name}.tsv ${params.mock_exomiser_output_https_url} + wget -O ${proband_id1}.tsv ${params.mock_exomiser_output_https_url} """ } } @@ -219,7 +234,7 @@ def checkParameterList(list, realList) { } /*-------------------------------------------------- - Definitions of accepted values for params + Definitions of accepted values for params ---------------------------------------------------*/ diff --git a/nextflow.config b/nextflow.config index 454dc6b..acb2791 100644 --- a/nextflow.config +++ b/nextflow.config @@ -23,8 +23,8 @@ params { exomiser_phenotype_data = 's3://lifebit-featured-datasets/pipelines/exomiser/very_fake/2102_phenotype' cadd_snvs = 's3://lifebit-featured-datasets/pipelines/exomiser/very_fake/cadd_snvs' phenix_data = 's3://lifebit-featured-datasets/pipelines/exomiser/very_fake/phenix' - application_properties = false - auto_config_yml = false + application_properties = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/application.properties' + auto_config_yml = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/auto_config_v2.yml' hpo_terms_file = false modes_of_inheritance = 'AUTOSOMAL_DOMINANT,AUTOSOMAL_RECESSIVE,X_RECESSIVE,UNDEFINED' prioritisers = 'hiPhivePrioritiser,phivePrioritiser,phenixPrioritiser' @@ -36,11 +36,14 @@ params { exomiser_version = '12.1.0' exomiser_data_directory = '/data/exomiser-data-bundle' + //inputs + families_file = false + // Debugging related parameters debug_script = "ls -l" echo = false errorStrategy = 'terminate' - + // container versions exomiser_container_tag = '12.1.0' cloudos_cli_container_tag = '0.0.2' @@ -50,17 +53,28 @@ params { aws_batch_cli_path = '/home/ec2-user/miniconda/bin/aws' aws_batch_fetch_instance_type = true aws_region = 'ap-east-1' + + //process resources + memory = 6.GB + cpus = 4 + 
maxForks = 1 + //submitRateLimit = '1/5min' + errorStrategy = 'retry' + maxRetries = 3 } profiles { - standard { includeConfig params.config } - awsbatch { includeConfig 'conf/executors/awsbatch.config' } - eu_west_1 { includeConfig 'conf/cloud-region/eu_west_1.config' } - eu_west_2 { includeConfig 'conf/cloud-region/eu_west_2.config' } - test_full { includeConfig "conf/tests/full/test_full.config" } - ci_test_data { includeConfig "conf/data/ci_test_data.config" } - quay { includeConfig 'conf/containers/quay.config' } + standard { includeConfig params.config } + test_family { includeConfig 'conf/family_test.config' } + test_single_vcf { includeConfig 'conf/single_vcf_test.config' } + test_multi_hpo { includeConfig 'conf/multi_hpo_test.config' } + awsbatch { includeConfig 'conf/executors/awsbatch.config' } + eu_west_1 { includeConfig 'conf/cloud-region/eu_west_1.config' } + eu_west_2 { includeConfig 'conf/cloud-region/eu_west_2.config' } + test_full { includeConfig "conf/tests/full/test_full.config" } + ci_test_data { includeConfig "conf/data/ci_test_data.config" } + quay { includeConfig 'conf/containers/quay.config' } } process { @@ -68,8 +82,12 @@ process { errorStrategy = params.errorStrategy withName: exomiser { container = "quay.io/lifebitai/exomiser:${params.exomiser_container_tag}" - containerOptions = "--volume ${params.exomiser_data_directory}:/data/exomiser-data-bundle/" - memory = 6.GB - cpus = 4 + containerOptions = "--volume ${params.exomiser_data_directory}:/data/" + memory = params.memory + cpus = params.cpus + maxForks = params.maxForks + //submitRateLimit = params.submitRateLimit + errorStrategy = params.errorStrategy + maxRetries = params.maxRetries } } \ No newline at end of file diff --git a/testdata/fam_file.tsv b/testdata/fam_file.tsv new file mode 100644 index 0000000..e83485b --- /dev/null +++ b/testdata/fam_file.tsv @@ -0,0 +1,2 @@ +run_id proband_id hpo vcf_path vcf_index_path proband_sex mother_id father_id +EX001 ERR3239334 HP:0001156 
s3://lifebit-featured-datasets/pipelines/exomiser-nf/family_ERR3239334_ERR3989341_ERR3989342_small.vcf.gz s3://lifebit-featured-datasets/pipelines/exomiser-nf/family_ERR3239334_ERR3989341_ERR3989342_small.vcf.gz.tbi M ERR3989342 ERR3989341 \ No newline at end of file diff --git a/testdata/fam_file_multi_hpo.tsv b/testdata/fam_file_multi_hpo.tsv new file mode 100644 index 0000000..6287f81 --- /dev/null +++ b/testdata/fam_file_multi_hpo.tsv @@ -0,0 +1,2 @@ +run_id proband_id hpo vcf_path vcf_index_path proband_sex mother_id father_id +EX001 ERR3239334 HP:0001156,HP:0001363,HP:0011304 s3://lifebit-featured-datasets/pipelines/exomiser-nf/family_ERR3239334_ERR3989341_ERR3989342_small.vcf.gz s3://lifebit-featured-datasets/pipelines/exomiser-nf/family_ERR3239334_ERR3989341_ERR3989342_small.vcf.gz.tbi M ERR3989342 ERR3989341 \ No newline at end of file diff --git a/testdata/single_vcf.tsv b/testdata/single_vcf.tsv new file mode 100644 index 0000000..8acb569 --- /dev/null +++ b/testdata/single_vcf.tsv @@ -0,0 +1,2 @@ +run_id proband_id hpo vcf_path vcf_index_path proband_sex mother_id father_id +EX001 ERR3239334 HP:0001156 s3://lifebit-featured-datasets/pipelines/exomiser-nf/ERR3239334_small.vcf.gz s3://lifebit-featured-datasets/pipelines/exomiser-nf/ERR3239334_small.vcf.gz.tbi M nan nan \ No newline at end of file