# module load singularity

scratch_dir=$(echo $PWD | sed "s/\/data\/ucct\/bi\/scratch_tmp/\/scratch/g")
mkdir logs

# Location of assemblies to a variable so it only has to be changed here
LOCATION=../*/*/assembly/*/*
# Other databases: 
# /data/ucct/bi/references/BLAST_dbs/nt_20211025/nt
BLAST_DATABASE="/data/ucct/bi/references/BLAST_dbs/viral_ncbi/viral_genomes_ncbi"

# if there are scaffolds, uncompress the scaffolds in its dir (zcat for decompression)
# if there contigs and no scaffolds, uncompress the contigs as scaffolds in its dir
echo "Samples that did not generate scaffolds:" > noscaffold.txt
cat ../samples_id.txt | while read in; do
    mkdir ${in}
    # ls will return 0 if there are no scaffolds file
    # NOTE: change extension and location at will
    # NOTE2: zcat is only used in case of gzipped files, use a cp or ln -s if needed
    if [ $(ls ${LOCATION}/${in}.scaffolds.fa.gz | wc -l) != 0 ]; then
        zcat ${LOCATION}/${in}.scaffolds.fa.gz > ${in}/${in}.scaffolds.fa
    else
        # Note assemblies that did not make a scaffold
        zcat ${LOCATION}/${in}.contigs.fa.gz > ${in}/${in}.scaffolds.fa
        echo ${in} >> noscaffold.txt
    fi
done 

# NOTE3: change the -query flag to meet your requirements
cat ../samples_id.txt | xargs -I %% echo "srun --chdir ${scratch_dir} --partition middle_idx --mem 200G --time 48:00:00 --cpus-per-task 10 --output logs/BLASTN_%%_%j.log --job-name BLASTN_%% singularity exec -B ${scratch_dir}/../../ -B /data/ucct/bi/references/BLAST_dbs/viral_ncbi/ /data/ucct/bi/pipelines/singularity-images/blast:2.11.0--pl5262h3289130_1 blastn -num_threads 10 -db ${BLAST_DATABASE} -query ${scratch_dir}/%%/%%.scaffolds.fa -out ${scratch_dir}/%%/%%_blast.tsv -outfmt '6 qseqid stitle qaccver saccver pident length mismatch gaps qstart qend sstart send evalue bitscore slen qlen qcovs' &" > _01_blast.sh

# Filtering criteria:
    # %refCovered > 0.7
    # ref not a phage (stitle ~! /phage/)
    # ref longer than 200 bp (slen > 200)

# First awk: create the full table; second awk: filter it
cat ../samples_id.txt | xargs -I %% echo "awk -v \"samplename=%%\" 'BEGIN{OFS=\"\t\";FS=\"\t\"}{print samplename,\$0,(\$6-\$8)/\$16,\$6/\$15}' %%/%%_blast.tsv | awk 'BEGIN{OFS=\"\t\";FS=\"\t\"} \$16 > 200 && \$17 > 0.7 && \$3 !~ /phage/ {print \$0}' > %%/%%_blast_filt.tsv" > _02_filter_blast.sh
echo -e "echo \"samplename\tqseqid\tstitle\tqaccver\tsaccver\tpident\tlength\tmismatch\tgap\tqstart\tqend\tsstart\tsend\tevalue\tbitscore\tref_len\tquery_len\tqcovs\t%queryAligned\t%refCovered\" > header" > _03_gather_results_add_header.sh
echo "cat header */*blast_filt.tsv > all_samples_filtered_BLAST_results.tsv" >> _03_gather_results_add_header.sh
cat ../samples_id.txt | xargs -I %% echo "cat header %%/%%_blast_filt.tsv > tmp; rm %%/%%_blast_filt.tsv; mv tmp %%/%%_blast_filt.tsv" >> _03_gather_results_add_header.sh
echo "rm header" >> _03_gather_results_add_header.sh

# NOTES FOR FILTERING
#
# subject = reference
#
# COLS GENERATED BY US:
#  1: samplename
# GENERATED BY BLAST
#  2: contigname - qseqid
#  3: stitle 
#  4: qaccver
#  5: saccver
#  6: pident
#  7: length (of alignment)
#  8: mismatch
#  9: gaps
# 10: qstart
# 11: qend
# 12: sstart
# 13: send
# 14: evalue
# 15: bitscore
# 16: ref len - slen
# 17: query len - qlen
# 18: qcovs
# MORE INFO: https://www.metagenomics.wiki/tools/blast/blastn-output-format-6
# GENERATED BY US:
# 19: %queryAligned: (length-gaps)/qlen (if gaps are not deleted, then this would be bigger than 1 sometimes)
# 20: %refCovered: length/slen

# conda activate 2excel
cat ../samples_id.txt | xargs -I %% echo "srun --chdir ${scratch_dir} --partition short_idx --mem 10G --time 1:00:00 --output logs/2excel_%%.log --job-name 2excel_%% python /data/ucct/bi/pipelines/utilities/export_excel_from_csv.py --input_file %%/%%_blast_filt.tsv --delimiter '\t' --output_filename %%/%%_blast_filt --it_has_index --it_has_header" > _04_to_excel.sh
echo "srun --chdir ${scratch_dir} --partition short_idx --mem 10G --time 1:00:00 --output logs/2excel_all.log --job-name 2excel_all python /data/ucct/bi/pipelines/utilities/export_excel_from_csv.py --input_file all_samples_filtered_BLAST_results.tsv --delimiter '\t' --output_filename all_samples_filtered_BLAST_results --it_has_index --it_has_header" >> _04_to_excel.sh
