Configuration File
The entire contents of the configuration file located at ./Iliad/config/config.yaml are shown below.
Read the comments and subheaders to see the breakdown and explanation of the workflow options.
The most important variables that a user can define are also stated within each How-To guide.
# The following configuration file holds very important variables that will help operate I L I A D.
# There are many provided download links that I L I A D will automatically download when you, the user, invoke it.
# All of these come with no warranties. Needless to say, but links tend to break over time!
# We will do our best to keep them up-to-date.
# Feel free to replace any links with your own preferences of files.
# Again, this comes with no warranties.
# __Author__ = Noah Herrick
# __Email__ = noahherrick1@gmail.com
# __Software__ = Iliad: Suite of Snakemake Genomic Data Processing Workflows
# __License__ = MIT License
# __copyright__ = Copyright 2023, Noah Herrick
# __Year__ = 2023
# __Version__ = 1.0.0
#####################################
#####################################
#####################################
# # # USER INPUT VARIABLES # # #
#####################################
#####################################
#####################################
# You must insert your /PATH/TO/Iliad/ here.
# Use the 'pwd' command to find your current working directory when you are inside the Iliad directory.
# The path must include a forward slash, '/', at the end, e.g. /user/name/projects/Iliad/
workdirPath: NEED PATH HERE
##############################################################################################
### --- Default tables and samples for Raw Sequence (FASTQ) and Stored Sequence (CRAM) --- ###
# ------------------------------------------------------------------------------------------ #
# To download raw FASTQ sequence data, make sure there is an Excel table or CSV document with two columns and no header: Sample_Name,FTP_url
# e.g. KPGP-00127,ftp://ftp.kobic.re.kr/pub/KPGP/2020_release_candidate/WGS_SR/KPGP-00127/KPGP-00127_L1_R1.fq.gz
samplesDict: config/UserSampleTable.csv
# Must list the samples in one column under a "sample" header, whether you use the download feature or redirect ILIAD to an existing FASTQ data path
samples: config/samples.tsv
# Same setup as above, for retrieving CRAM files from an FTP server
cramSamplesDict: config/cramSampleTable.csv
cramSamples: config/cramSamples.tsv
###########################################################
### --- Default GENOME REFERENCE ASSEMBLY retrieval --- ###
# ------------------------------------------------------- #
# To automatically download the reference genome assembly, set AutoRetrieveReference to 'true' - otherwise leave blank!
AutoRetrieveReference: true # default is true
# If you already have a specific reference genome assembly, set IhaveReference to 'true',
# place the file into your ./Iliad/resources/ directory,
# and configure its filename under 'reference' below
# - otherwise leave blank!
IhaveReference: # default is blank
# If you have your own reference file to use, state the filePath
# - DO NOT REMOVE "resources/".
# It MUST be in your "./Iliad/resources/" directory, like so: ./Iliad/resources/FILENAME
reference:
  filePath: resources/GRCh38_full_analysis_set_plus_decoy_hla.fa  # This is a popular example that you might already have filed away
############################################################
### --- Default VARIANT CALLING options via BCFtools --- ###
# -------------------------------------------------------- #
# BCFtools manual LINK: https://samtools.github.io/bcftools/bcftools.html
# BCFtools cheat sheet LINK: https://gist.github.com/elowy01/93922762e131d7abd3c7e8e166a74a0b
VariantCalling:
  # See the BCFtools manual for adding additional options, e.g. "-B" for base alignment quality.
  # Add options inside the quotation marks; each 'options' string belongs to the subcommand it is nested under.
  mpileup:
    options: "-d 8000 -B"  # default is → -d 8000 -B
  call:
    options: "-m -A"  # default is → -m -A
# NOTE(review): Normalize, Norm and doNotNormalize are kept at the nesting level shown in this copy of the file;
# confirm against the workflow rules whether they belong under VariantCalling.
# Normalize and left-align - set Normalize to 'true' - otherwise leave blank!
Normalize: true
# Options in effect when "Normalize: true" are "norm -f {reference}".
# See the BCFtools manual for additional options; add other flags inside the quotation marks of 'options' below.
Norm:
  options: ""  # default is blank
# DO NOT normalize and left-align - set doNotNormalize to 'true' - otherwise leave blank!
doNotNormalize: # default is blank - benchmarked as true
################################################
### --- Lift and Merge Submodule Options --- ###
# -------------------------------------------- #
# Place the appropriate BASE of each filename under the file header "baseFileName_VCF"
# i.e. if the file is FILENAME.vcf, then the BASE is "FILENAME".
# These can be either compressed (.vcf.gz and .vcf.gz.[tbi/csi]) or uncompressed (.vcf).
# A compressed file will need the associated index file in the directory, too.
vcfs: config/mergeTheseVCFs.txt
LiftoverTF: true # default is true
# To update your genomic positions to the Homo sapiens GRCh38 reference assembly, set Version38 below to 'true' - otherwise mark 'false'!
Version38: true # default is true
# To update your genomic positions to the Homo sapiens GRCh37 reference assembly, set Version38 above to 'false'
# Target assembly, project name, and per-assembly dbSNP download links and filenames.
# The children must stay nested: file37/file38 are also defined under genomeReference.
dbsnpLiftMerge:
  desiredVersion: GRCh38
  projectName: Demo
  # ----------- 37 -------------
  dbsnp37VcfDownload: https://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh37p13/VCF/All_20180423.vcf.gz
  dbsnp37TbiDownload: https://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh37p13/VCF/All_20180423.vcf.gz.tbi
  file37: All_20180423.vcf.gz
  # ----------- 38 -------------
  dbsnp38VcfDownload: https://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh38p7/VCF/All_20180418.vcf.gz
  dbsnp38TbiDownload: https://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh38p7/VCF/All_20180418.vcf.gz.tbi
  file38: All_20180418.vcf.gz
# Per-assembly reference genome download locations and filenames.
# The children must stay nested: file37/file38 are also defined under dbsnpLiftMerge.
genomeReference:
  # ----------- 37 -------------
  37Reference: http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37
  file37: human_g1k_v37.fasta
  # ----------- 38 -------------
  38Reference: http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/GRCh38_reference_genome/
  file38: GRCh38_full_analysis_set_plus_decoy_hla.fa
  index38: GRCh38_full_analysis_set_plus_decoy_hla.fa.fai
#############################################
### --- dbSNP annotation file options --- ###
# ----------------------------------------- #
# Used in the Raw Sequence Module, Stored Sequence Module, and SNP Array Module.
# The uncommented configuration options will be used for these modules.
# Swap the commented/uncommented three configuration lines if you would like to switch versions.
# You may also update the FTP links if you would like a different dbSNP annotation file - but of course that comes with no warranties.
dbSNP:
  # FTP site: https://ftp.ncbi.nih.gov/snp/
  # dbsnp all file
  # Check to see if you are using the correct assembly with your project
  # ----------- 37 -------------
  # dbsnpVcfDownload: https://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh37p13/VCF/All_20180423.vcf.gz
  # dbsnpTbiDownload: https://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh37p13/VCF/All_20180423.vcf.gz.tbi
  # file: All_20180423.vcf.gz
  # ----------- 38 -------------
  dbsnpVcfDownload: https://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh38p7/VCF/All_20180418.vcf.gz
  dbsnpTbiDownload: https://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh38p7/VCF/All_20180418.vcf.gz.tbi
  file: All_20180418.vcf.gz
#####################################
#####################################
#####################################
# # # DEFAULT VARIABLES # # #
#####################################
#####################################
#####################################
# Used in the Raw Sequence Module, Stored Sequence Module, and SNP Array Module
# Reference Genome Assembly
ref:
  # ensembl species name
  species: homo_sapiens
  # ensembl release
  release: 104
  # genome build
  build: GRCh38
# Used in the Raw Sequence Module and Stored Sequence Module
# Annotation files for variant calling
NYGC:
  # FTP Site: http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20190425_NYGC_GATK/
  # annotation files are in the GRCh38 assembly
  # URL directory, filename prefix, and filename suffix of the annotation files
  nygcUrlPath: http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20190425_NYGC_GATK/annotated/
  nygcFileStart: CCDG_13607_B01_GRM_WGS_2019-02-19_chr
  nygcFileEnd: .recalibrated_variants.annotated.txt
# NOTE(review): presumably the number of region files the work is split into - confirm against the workflow rules,
# and confirm whether this key is top-level (as shown here) or belongs under NYGC.
numberOfSplitRegionsFiles: 5
###################################
### --- RAW SEQUENCE MODULE --- ###
# ------------------------------- #
# Used to shorten the downloaded directories so the file is placed directly in the correct folder.
# You will need to edit this based on your FTP download path.
url:
  cutdirs: 5
######################################
### --- STORED SEQUENCE MODULE --- ###
# ---------------------------------- #
# Used to shorten the downloaded directories so the file is placed directly in the correct folder.
# You will need to edit this based on your FTP download path.
cramUrl:
  cutdirs: 5
################################
### --- SNP ARRAY MODULE --- ###
# ---------------------------- #
urlProductFiles:
  # product files LINK
  # LINK: https://support.illumina.com/downloads/infinium-multi-ethnic-global-8-v1-product-files.html
  # manifest file LINK: update to 37 or 38, and make sure it is a BPM file
  # ----------- 37 -------------
  # manifest: https://webdata.illumina.com/downloads/productfiles/multiethnic-global-8/v1-0/infinium-multi-ethnic-global-8-d1-bpm.zip
  # mzip: infinium-multi-ethnic-global-8-d1-bpm.zip
  # filename: Multi-EthnicGlobal_D1.bpm  # for expanding function later
  ## build: D1  # for expanding function later
  # ----------- 38 -------------
  ## LINK: ftp://ussd-ftp.illumina.com/downloads/productfiles/multiethnic-global-8/v1-0/build38
  manifest: https://webdata.illumina.com/downloads/productfiles/multiethnic-global-8/v1-0/build38/multi-ethnic-global-8-d2-bpm.zip
  mzip: multi-ethnic-global-8-d2-bpm.zip
  # filename: Multi-EthnicGlobal_D2.bpm  # for expanding function later
  ## build: D2  # for expanding function later
  # cluster file LINK
  cluster: https://webdata.illumina.com/downloads/productfiles/multiethnic-global-8/v1-0/infinium-multi-ethnic-global-8-d1-cluster-file.zip
  czip: infinium-multi-ethnic-global-8-d1-cluster-file.zip
urlSupportFiles:
  # support files LINK
  # LINK: https://support.illumina.com/downloads/infinium-multi-ethnic-global-8-v1-support-files.html
  # ----------- 37 ------------- uncomment the 37 section below if you need to use the GRCh37 assembly
  # physical and genetic coordinates for 37
  # physicalGeneticCoordinates: https://support.illumina.com/content/dam/illumina-support/documents/downloads/productfiles/multiethnic-global/multi-ethnic-global-8-d1-physical-genetic-coordinates.zip
  # pzip: multi-ethnic-global-8-d1-physical-genetic-coordinates.zip  # Multi-EthnicGlobal_D1.csv_Physical-and-Genetic-Coordinates.txt
  # ----------- 38 ------------- comment out the 38 section below if you need to use the GRCh37 assembly
  # physical and genetic coordinates for 38
  physicalGeneticCoordinates: https://support.illumina.com/content/dam/illumina-support/documents/downloads/productfiles/multiethnic-global/multi-ethnic-global-8-d2-physical-genetic-coordinates.zip
  pzip: multi-ethnic-global-8-d2-physical-genetic-coordinates.zip  # Multi-EthnicGlobal_D2.csv_Physical-and-Genetic-Coordinates.txt
  # rsids conversion file - Loci Name to rsID
  rsidConversion: https://support.illumina.com/content/dam/illumina-support/documents/downloads/productfiles/multiethnic-global/multi-ethnic-global-8-d2-b150-rsids.zip
  rzip: multi-ethnic-global-8-d2-b150-rsids.zip
  rfile: Multi-EthnicGlobal_D2_b150_rsids.txt
Illumina:
  # iaap-cli download link, archive name, extracted directory name, and executable name
  # NOTE(review): the FTP URL below embeds a username and password in plain text - confirm these are
  # the publicly documented Illumina download credentials before redistributing this file.
  ftpDownload: ftp://webdata2:webdata2@ussd-ftp.illumina.com/downloads/software/iaap/iaap-cli-linux-x64-1.1.0.tar.gz
  DownloadTarFile: iaap-cli-linux-x64-1.1.0.tar.gz
  Download: iaap-cli-linux-x64-1.1.0
  # iaap-cli exe path
  iaapcli: iaap-cli
  # iaapcli: /N/project/WalshWGS/IliadGenomicDataPipeline/Iliad/target_workflow/illumina_gencall/AutoConvert2.0/AutoConvert
################################
### --- SNP ARRAY MODULE --- ###
### - QC VALUE THRESHOLDS - ###
# ---------------------------- #
# Upper and lower QC thresholds for the GenTrain and cluster separation values (SNP Array Module)
QCarray:
  GenTrainUpperThreshold: 0.7
  GenTrainLowerThreshold: 0.67
  ClusterSepUpperThreshold: 0.45
  ClusterSepLowerThreshold: 0.4
#####################################
#####################################
#####################################
# # # S U B M O D U L E S # # #
#####################################
#####################################
#####################################
# The major submodule named - Lift-and-Merge - can be found above near line 101.
# There are many configurations, checks, and automatic steps that may help users with little experience.
# These more independent and small task workflows below may come in handy for some quick data maneuvers.
MergerSub:
LiftoverSub:
  # Either point to a file in the config directory or enter 1 filename for the file needing conversion
  # Indicate the reference assembly to which you want to switch your positions
  filename: Tatte-Demo
  desiredVersion: GRCh38  # switch to GRCh37 if you need to revert from 38 to 37
MergeTargetAndRef: