# module load singularity
# module load Java/17.0.2.lua R/4.2.1

scratch_dir=$(echo $PWD | sed "s/\/data\/ucct\/bi\/scratch_tmp/\/scratch/g")

ln -s /data/ucct/bi/references/eukaria/homo_sapiens/cache_vep/custom_databases/dbNSFP/GRCh37-4.3/dbNSFP_ENSG_gene_GRCh37.txt
ln -s /data/ucct/bi/references/eukaria/homo_sapiens/cache_vep/custom_databases/dbNSFP/GRCh37-4.3/dbNSFP_ENSG_plugin_hg19.txt

mkdir -p vep
mkdir -p exomiser
mkdir -p logs

#-----------------------------------------------------------------------------------------------------

# 1 . Lablog to modify VCF ID field before running VEP.
echo "awk 'BEGIN{FS=\"\t\";OFS=\"\t\"} {if( \$0 ~ /^#/ ){print \$0}else{printf \"%s\t%s\t%s\t\", \$1,\$2,\$1\"_\"\$2\"_\"\$4\"_\"\$5 ; for (i=4; i<=NF; i++){printf \"%s\t\",\$i} ; printf \"\n\"}}' ../02-postprocessing/variants_fil.vcf > ./vep/variants_fil_mod.vcf" > _01_bcftools_query.sh
echo "sed -i 's/\t$//' ./vep/variants_fil_mod.vcf" >> _01_bcftools_query.sh

# 2. Create variant table.

echo "singularity exec -B ${scratch_dir}/../../../ /data/ucct/bi/pipelines/singularity-images/bcftools:1.12--h45bccc9_1 bcftools query -H -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\t%FILTER\t[%GT\t%DP\t%AD\t%GQ\t]\n' ${scratch_dir}/vep/variants_fil_mod.vcf > ${scratch_dir}/vep/variants.table" >> _01_bcftools_query.sh
echo "sed -i -r 's/(#|\[[0-9]+\])//g' ./vep/variants.table;sed -i 's/:/_/g' ./vep/variants.table;sed -i 's/ //g' ./vep/variants.table;sed -i 's/\t*$//g' ./vep/variants.table " >> _01_bcftools_query.sh

echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/BCFTOOLSQUERY.log --job-name BCFTOOLSQUERY bash ./_01_bcftools_query.sh &" > _01_run_bcftools_query.sh

#------------------------------------------------------------------------------------------------------

# 3. Lablog for annotating whole genome samples using Variant Effect Predictor (VEP).
# Run Vep without the plugin columns

echo "srun --partition short_idx --mem 100G --time 12:00:00 --chdir ${scratch_dir} --output logs/VEP.log --job-name VEP singularity exec -B /data/ucct/bi/references/eukaria/homo_sapiens/hg19/1000genomes_b37/genome -B /data/ucct/bi/references/eukaria/homo_sapiens/cache_vep -B ${scratch_dir}/../../../ /data/ucct/bi/pipelines/singularity-images/ensembl-vep:103.1--pl5262h4a94de4_2 vep --fasta /data/ucct/bi/references/eukaria/homo_sapiens/hg19/1000genomes_b37/genome/human_g1k_v37.fasta -i ${scratch_dir}/vep/variants_fil_mod.vcf -o ${scratch_dir}/vep/vep_annot.vcf --cache --offline --dir_cache /data/ucct/bi/references/eukaria/homo_sapiens/cache_vep/ --everything --assembly GRCh37 --tab &" > _02_vep_annotation.sh

#--------------------------------------------------------------------------------------------------------------------

# 4. Modify vep_annot.vcf in order to eliminate columns that start with ##, rename column #Uploaded_variation as ID

echo "grep -v '^##' ./vep/vep_annot.vcf > ./vep/vep_annot_head.txt" > _03_Vep_plugin_dbNSFP_parse.sh
echo "sed -i 's/#Uploaded_variation/ID/' ./vep/vep_annot_head.txt" >> _03_Vep_plugin_dbNSFP_parse.sh

# 5. Merge plugin columns in dbNSFP_ENSG_plugin.txt to obtain a column named "ID", delete the original columns, keep ID. 
# Rename the file to dbNSFP_ENSG_plugin_Columns and merge it vep_annot_head.txt by the column "ID". 
# Save as vep_plugin.txt. 
# Merge vep_plugin.txt with dbNSFP_ENSG_gene_GRCh37.txt by "Gene" column, save as vep_dbNSFP.txt. 
# Merge vep_dbNSFP.txt with variants.table by "ID" column, save as variants_annot_all.tab

echo "srun --partition short_idx --mem 200G --time 12:00:00 --chdir ${scratch_dir} --output logs/MERGE_ALL.log --job-name MERGE_ALL Rscript Merge_All.R" >> _03_Vep_plugin_dbNSFP_parse.sh

echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/AWK.log --job-name AWK bash ./aux_03_awk.sh &" >> _03_Vep_plugin_dbNSFP_parse.sh

# 6. Filter variants_annot_all.tab

echo "awk 'NR>1 && \$58 > 0.001 {print \$0}' variants_annot_all.tab > ./variants_annot_filterAF.tab" >> aux_03_awk.sh

echo "cat header_vep_final_annot.txt variants_annot_filterAF.tab > variants_annot_filterAF_head.tab" >> aux_03_awk.sh

echo "rm variants_annot_filterAF.tab" >> aux_03_awk.sh

#-------------------------------------------------------------------------------------------------------

# 9. Running exomiser
vcf_file=$(realpath ../02-postprocessing/variants_fil.vcf | sed "s/\/data\/ucct\/bi\/scratch_tmp/\/scratch/g")
ped_file=$(realpath ../../../DOC/family.ped | sed "s/\/data\/ucct\/bi\/scratch_tmp/\/scratch/g")
proband=$(awk 'BEGIN{FS="\t"} $6 == 2 {print $2}' ../../../DOC/family.ped)
bed_file=$(realpath ../../../REFERENCES/Illumine_Exome_CEX_TargetedRegions_v1.2_modb37_mod.bed | sed "s/\/data\/ucct\/bi\/scratch_tmp/\/scratch/g")
output_folder=$(realpath ./exomiser/exomiser | sed "s/\/data\/ucct\/bi\/scratch_tmp/\/scratch/g")

sed -i "s|VCF_FILE|${vcf_file}|g" ./exomiser_configfile.yml
sed -i "s|PED_FILE|${ped_file}|g" ./exomiser_configfile.yml
sed -i "s|PROBAND|${proband}|g" ./exomiser_configfile.yml
sed -i "s|BED_FILE|${bed_file}|g" ./exomiser_configfile.yml
sed -i "s|OUTPUT_FOLDER|${output_folder}|g" ./exomiser_configfile.yml

# THE FILE "spring.log" MUST BE DELETED IN THE CORRESPONDING NODE
echo "java -Xms2g -Xmx4g -jar exomiser-cli-13.0.0.jar --analysis ${scratch_dir}/exomiser_configfile.yml; rm /tmp/spring.log" > aux_04_exomiser_exome.sh
echo "srun --partition short_idx --mem 100G --time 2:00:00 --chdir /data/ucct/bi/pipelines/exomiser/exomiser-cli-13.0.0 --output logs/EXOMISER.log --job-name EXOMISER bash ${scratch_dir}/aux_04_exomiser_exome.sh &" > _04_exomiser_exome.sh

#---------------------------------------------------------------------------------------------------------


## Lablog to modify the output reported by exomiser and create a final file with a personalized format.
# Grep variant id for each inheritance model
cat inheritance_types.txt | xargs -I % echo "grep 'PASS' ./exomiser/exomiser_%.variants.tsv | awk '{print \$1\"_\"\$2\"_\"\$3\"_\"\$4}' > ./id_%.txt " >> _05_filter_inheritance.sh

# Grep variants for each inheritance models from the full annotated variants file
cat inheritance_types.txt | xargs -I % echo "grep -f ./id_%.txt ./variants_annot_all.tab > ./vep_annot_%.txt" >> _05_filter_inheritance.sh

cat inheritance_types.txt | xargs -I % echo "cat header_vep_final_annot.txt ./vep_annot_%.txt > ./vep_annot_%_final.txt" >> _05_filter_inheritance.sh

echo "rm id_*" >> _05_filter_inheritance.sh
cat inheritance_types.txt | xargs -I % echo "rm ./vep_annot_%.txt" >> _05_filter_inheritance.sh

# annot_all table is huge, lets shrink it a little bit
echo "srun --partition short_idx --chdir ${scratch_dir} --output logs/COMPRESS_ALL.log --job-name COMPRESS_ANNOT_ALL gzip variants_annot_all.tab &" >> _05_filter_inheritance.sh
