# -*- coding: utf-8

'''
    This is a snakemake for the metagenomics workflow in the Meren Lab using
    anvi'o.

    It includes the following steps:
    Quality filtering
    Assembly using megahit
    Mapping of metagenomes to assemblies using bowtie2
    generating anvio contigs database (including running hmm profile)
    generating anvio profile database

    The following files must exist in the working directory:
    config.json - this file contains essential configuration information for
    the pipeline.

    samples.txt -
        TAB-delimited file to describe where samples are. The
        header line should be "sample", "r1", and "r2". Each
        row should list the sample name in the first column,
        and full path for r1 and r2.



    An example run of this workflow on the barhal server:
    $ snakemake --snakefile merenlab-metagenomics-pipeline.snakefile \
                --cluster-config cluster.json --cluster 'clusterize  \
                -n {threads} -log {log}' --jobs 4 --latency-wait 100 -p

    Or without --cluster-config and on one line:

    $ snakemake $METAPIPE/merenlab-metagenomics-pipeline.snakefile --cluster 'clusterize -n {threads} -log {log}' --jobs 4 --latency-wait 100 -p

    Note on rule order: whenever the order of rule execution was ambiguous
        mypreferred approach was to use the rule dependencies. See:
        http://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#rule-dependencies

    Note on cluster configuration: because multiple rules require the
    number of threads as input (for example anvi-profile, megahit), and I
    couldn't find a way to make the number of threads from the
    cluster.config file available within rules, then instead I define the
    number of threads within each rule. I'm aware it's less elegant than
    having all cluster configuration in the cluster.json file, and would
    love to learn about an alternative solution if you have one.

    Note on log files: in order for the stdout and stderr to be written
    into log files, I have added `>> {log} 2>&1` to each shell command. if
    running on a cluster, I suggested including something like this in
    your `--cluster` command: `--log {log}`.
'''
import os
import anvio
import argparse
import pandas as pd
import anvio.utils as u
import anvio.workflows as w
import anvio.dbops as dbops
import anvio.terminal as terminal
import anvio.filesnpaths as filesnpaths

from anvio.errors import ConfigError, FilesNPathsError
from anvio.workflows.metagenomics import MetagenomicsWorkflow
from anvio.tables.miscdata import TableForLayerAdditionalData

__author__ = "Alon Shaiber"
__copyright__ = "Copyright 2017, The anvio Project"
__credits__ = []
__license__ = "GPL 3.0"
__version__ = anvio.__version__
__maintainer__ = "Alon Shaiber"
__email__ = "alon.shaiber@gmail.com"

run = terminal.Run()
progress = terminal.Progress()

slave_mode = False if 'workflows/metagenomics' in workflow.included[0] else True

if not slave_mode:
    # it is important that this comes before the include statement since
    # M is defined here and will also be used in the contigs workflow
    M = MetagenomicsWorkflow(argparse.Namespace(config=config))
    M.init()
    dirs_dict = M.dirs_dict
    # in order to generate the contigs databases we include the snakefile
    # for the generation of contigs databases
    include: w.get_workflow_snake_file_path('contigs')

# sanity check for requested assembly
# make sure that the user is not requesting multiple assemblers
available_assemblers = M.get_assembly_software_list()
number_of_assemblers_in_config_file = 0
assembly_software_in_config = list()
for a in available_assemblers:
    if M.get_param_value_from_config([a, 'run']) == True:
        number_of_assemblers_in_config_file += 1
        assembly_software_in_config.append(a)
if number_of_assemblers_in_config_file > 1:
    raise ConfigError("This workflow supports the usage of only one assembly "
                       "software, yet all the following software are "
                       "included in your config file: %s. Please include only one." % assembly_software_in_config)


# by default fastq files will be zipped after qc is done
run_gzip_fastqs = M.get_param_value_from_config(['gzip_fastqs', 'run']) == True

run_import_percent_of_reads_mapped = M.get_param_value_from_config(['import_percent_of_reads_mapped', 'run']) == True

run_idba_ud = M.get_param_value_from_config(['idba_ud', 'run'])

run_megahit = M.get_param_value_from_config(['megahit', 'run']) == True

if not M.references_mode and number_of_assemblers_in_config_file!=1 :
    # Sanity check for assembly mode
    # Make sure that user specified an assembler
    raise ConfigError("You didn't specify any assembler for your metagenomes, and yet "
                      "your config file shows you are not using references_mode. If you "
                      "already have your assemblies or reference fasta files, then maybe "
                      "you need to add '\"references_mode\": true', and a '\"fasta_txt\": "
                      "\"name-of-your-fasta-txt-file\"' to your config file. Otherwise, "
                      "if you do mean to use assembly, please set 'run: true' for one of the "
                      "following available assemblers: %s" % ', '.join(available_assemblers))

ruleorder: import_percent_of_reads_mapped > import_krakenuniq_taxonomy

rule metagenomics_workflow_target_rule:
    '''
        The target rule for the workflow.

        The final product of the workflow is an anvi'o merged profile directory
        for each group
    '''
    input: M.get_metagenomics_target_files()


rule iu_gen_configs:
    '''
        Generating a config file for each sample.

        Notice that this step is ran only once and generates the config files for all samples
    '''
    version: 1.0
    log: dirs_dict["LOGS_DIR"] + "/iu_gen_configs.log"
    # the input file is marked as 'ancient' so snakemake wouldn't run it
    # just because a new path-to-raw-fastq-files.txt file was created.
    input: ancient(M.get_param_value_from_config(['samples_txt']))
    output: expand("{DIR}/{sample}.ini", DIR=dirs_dict["QC_DIR"], sample=M.sample_names)
    params:
        dir=dirs_dict["QC_DIR"],
        r1_prefix = M.get_rule_param("iu_gen_configs", "--r1-prefix"),
        r2_prefix = M.get_rule_param("iu_gen_configs", "--r2-prefix")
    threads: M.T('iu_gen_configs')
    resources: nodes = M.T('iu_gen_configs'),
    shell: "iu-gen-configs {input} -o {params.dir} {params.r2_prefix} {params.r1_prefix} >> {log} 2>&1"


def get_raw_fastq(sample):
    ''' return a dict with the path to the raw fastq files'''
    r1 = list(M.samples_information[M.samples_information["sample"] == sample]['r1'])[0].split(',')
    r2 = list(M.samples_information[M.samples_information["sample"] == sample]['r2'])[0].split(',')
    return {'r1': r1, 'r2': r2}


def get_fastq(sample, zipped=run_gzip_fastqs, post_reference_based_short_read_removal=M.remove_short_reads_based_on_references):
    ''' return the pair of compressed fastq files for a sample.

        There are two types of sources for the fastq:
            1. The output of QC.
            2. From the specified paths in samples.txt (in the case the user
                                                        chose to skip QC).
        This helper function returns the appropriate paths according to the
        config file.
    '''
    fastq_label = "-QUALITY_PASSED"
    if post_reference_based_short_read_removal:
        fastq_label = "-FILTERED"

    if post_reference_based_short_read_removal or M.run_qc:
        # by default, use the output of the reference based short read removal
        if zipped:
            r1 = os.path.join(dirs_dict["QC_DIR"], sample + fastq_label + "_R1.fastq.gz")
            r2 = os.path.join(dirs_dict["QC_DIR"], sample + fastq_label + "_R2.fastq.gz")
        else:
            r1 = os.path.join(dirs_dict["QC_DIR"], sample + fastq_label + "_R1.fastq")
            r2 = os.path.join(dirs_dict["QC_DIR"], sample + fastq_label + "_R2.fastq")
        d = {'r1': r1, 'r2': r2}
    else:
        # if no QC and no reference based short read removal is requested, use raw input
        d = get_raw_fastq(sample)
        if len(d['r1']) > 1 or len(d['r2']) > 1:
            raise ConfigError('You are skipping QC, but some of your samples (for example %s) are composed of more than 1 pair of fastq files '
                  'this is not allowed. Please first merge all the fastq files corresponding to a single sample (or just use QC, and the '
                  'merging will be done for you).' % sample)
        d = {'r1': d['r1'][0],
             'r2': d['r2'][0]}
    return d


def input_for_qc(sample):
    ''' return a dict with input for qc rule'''
    d = {'ini': ancient(dirs_dict["QC_DIR"] + "/%s.ini" % sample)}
    d.update(get_raw_fastq(sample))
    return d


qc_output_r1 = os.path.join(dirs_dict["QC_DIR"], "{sample}-QUALITY_PASSED_R1.fastq")
qc_output_r2 = os.path.join(dirs_dict["QC_DIR"], "{sample}-QUALITY_PASSED_R2.fastq")
rule iu_filter_quality_minoche:
    ''' Run QC using iu-filter-quality-minoche '''
    version: 1.0
    log: dirs_dict["LOGS_DIR"] + "/{sample}-iu_filter_quality_minoche.log"
    # making the config file as "ancient" so QC wouldn't run just because
    # a new config file was produced.
    input: unpack(lambda wildcards: input_for_qc(wildcards.sample))
    output:
        r1 = temp(qc_output_r1) if M.remove_short_reads_based_on_references \
                                else qc_output_r1,
        r2 = temp(qc_output_r2) if M.remove_short_reads_based_on_references \
                                else qc_output_r2,
        stats = dirs_dict["QC_DIR"] + "/{sample}-STATS.txt"
    params:
        ignore_deflines = M.get_rule_param("iu_filter_quality_minoche", "--ignore-deflines"),
        visualize_quality_curves = M.get_rule_param("iu_filter_quality_minoche", "--visualize-quality-curves"),
        limit_num_pairs = M.get_rule_param("iu_filter_quality_minoche", "--limit-num-pairs"),
        print_qual_scores = M.get_rule_param("iu_filter_quality_minoche", "--print-qual-scores"),
        store_read_fate = M.get_rule_param("iu_filter_quality_minoche", "--store-read-fate")
    threads: M.T('iu_filter_quality_minoche')
    resources: nodes = M.T('iu_filter_quality_minoche'),
    shell:
        """
            iu-filter-quality-minoche {input.ini} {params.store_read_fate}\
                                      {params.print_qual_scores} {params.limit_num_pairs}\
                                      {params.visualize_quality_curves} {params.ignore_deflines} >> {log} 2>&1
        """


def need_to_uncompress_fastqs(r1, r2):
    # Checking if any of the input files are compressed
    # this should only happen if QC is not performed in this snakemake session
    # because if QC is performed in this session then the output of iu-filter-quality-minoche
    # is not compressed and snakemake will schedule fq2fa rule before gzip rule
    files_that_end_with_gz = [f for f in [r1, r2] if f.endswith('.gz')]
    if len(files_that_end_with_gz) == 1:
        raise ConfigError("Something seems very bad: one of the pair fastq files "
                          "is compressed and the other one is not. This is the "
                          "compressed one: %s" % files_that_end_with_gz[0])
    elif len(files_that_end_with_gz) == 2:
        run.warning("The following fastq files are compressed and will now "
                    "be uncompressed using gunzip: %s." % files_that_end_with_gz)
        return True
    return False


def uncompress_fastqs_if_needed(r1, r2, log):
    if need_to_uncompress_fastqs(r1, r2):
        r1_unzipped = r1.replace('.fastq.gz', '_temp.fastq')
        r2_unzipped = r2.replace('.fastq.gz', '_temp.fastq')
        shell("gunzip < %s > %s 2>> %s" % (r1, r1_unzipped, log))
        shell("gunzip < %s > %s 2>> %s" % (r2, r2_unzipped, log))
        r1 = r1_unzipped
        r2 = r2_unzipped
    return r1, r2


def input_for_remove_short_reads_based_on_references(wildcards):
    d = {}
    d['bam'] = expand(os.path.join(dirs_dict["MAPPING_DIR"], "{group}", "{sample}.bam"), \
                      group=list(M.references_for_removal.keys()), sample=wildcards.sample)
    d.update(get_fastq(wildcards.sample, post_reference_based_short_read_removal=False))
    return d


gzip_suffix = '.gz' if run_gzip_fastqs else ''
rule remove_short_reads_based_on_references:
    version: 1.0
    log: os.path.join(dirs_dict["LOGS_DIR"], "{sample}-remove_short_reads_based_on_references.log")
    input: unpack(input_for_remove_short_reads_based_on_references)
    output:
        ids_to_remove = os.path.join(dirs_dict["QC_DIR"], '{sample}-ids-to-remove.txt'),
        # For each one of the four following outputs we have an alternative
        # file name for the case that the user chose dont_remove_just_map
        # These mock files end with '-mock-output'
        keep1 = os.path.join(dirs_dict["QC_DIR"], "{sample}-FILTERED_R1.fastq" + gzip_suffix) \
                    if M.remove_short_reads_based_on_references else temp(os.path.join(dirs_dict["QC_DIR"], '{sample}-mock-output1')),
        remove1 = temp(os.path.join(dirs_dict["QC_DIR"], "{sample}.R1.fastq.removed")) \
                    if M.remove_short_reads_based_on_references else temp(os.path.join(dirs_dict["QC_DIR"], '{sample}-mock-output2')),
        keep2 = os.path.join(dirs_dict["QC_DIR"], "{sample}-FILTERED_R2.fastq" + gzip_suffix) \
                    if M.remove_short_reads_based_on_references else temp(os.path.join(dirs_dict["QC_DIR"], '{sample}-mock-output3')),
        remove2 = temp(os.path.join(dirs_dict["QC_DIR"], "{sample}.R2.fastq.removed")) \
                    if M.remove_short_reads_based_on_references else temp(os.path.join(dirs_dict["QC_DIR"], '{sample}-mock-output4')),
    params:
        delimiter = M.get_param_value_from_config(['remove_short_reads_based_on_references', 'delimiter-for-iu-remove-ids-from-fastq'])
    threads: M.T('remove_short_reads_based_on_references')
    resources: nodes = M.T('remove_short_reads_based_on_references')
    run:
        references = M.references_for_removal.keys()
        for bam in input.bam:
            # merge the ids from matching any reference
            shell('samtools view %s | cut -f 1 >> %s 2>>{log}' % (bam, output.ids_to_remove))
        ids_to_remove = set(open(output.ids_to_remove).read().splitlines())
        with open(output.ids_to_remove, 'w') as f:
            for item in ids_to_remove:
                f.write("%s\n" % item)
        if M.remove_short_reads_based_on_references:
            if ids_to_remove:
                need_to_uncompress = need_to_uncompress_fastqs(input.r1, input.r2)
                r1, r2 = uncompress_fastqs_if_needed(input.r1, input.r2, log)
                # remove ids
                intermediate_fastq = {"1": r1, "2": r2}
                output_fastq_files = {'remove': {'1': output.remove1, '2': output.remove2},\
                                      'keep': {'1': output.keep1, '2': output.keep2}}
                output_prefix = dirs_dict["QC_DIR"]
                for i in intermediate_fastq:
                    # first make sure that the expected output files don't already exist
                    # if they exist, then we delete them, since they are probably left from previous failed run
                    shell('rm -rf {fq}.removed {fq}.survived'.format(fq=intermediate_fastq[i]))
                    shell('iu-remove-ids-from-fastq -i %s -l {output.ids_to_remove} -d "{params.delimiter}"' % intermediate_fastq[i])
                    shell('mv %s.removed %s' % (intermediate_fastq[i], output_fastq_files['remove'][i]))
                    if run_gzip_fastqs:
                        shell("gzip < %s.survived > %s" % (intermediate_fastq[i], output_fastq_files['keep'][i]))
                        shell("rm %s.survived" % intermediate_fastq[i])
                    else:
                        shell("mv %s.survived %s" % (intermediate_fastq[i], output_fastq_files['keep'][i]))
                if need_to_uncompress:
                    shell('rm %s %s' % (r1, r2))
            else:
                # No reads from this sample were mapped to references so we will just:
                # change the name of the input fastq to the expected output fastq
                # and then create mock files for the "removed" fastq files (just so snakemake is happy and sees all expected output)
                shell('mv {input.r1} {output.keep1}')
                shell('mv {input.r2} {output.keep2}')
                shell('touch {output.remove1} {output.remove2}')
        else:
            # we only need the list of ids that matched the references
            # we are not removing enything so we just create mock output files to make snakemake happy
            shell('touch {output.keep1} {output.keep2} {output.remove1} {output.remove2}')


rule gen_report_for_mapping_to_references_for_removal:
    version: 1.0
    log: os.path.join(dirs_dict["LOGS_DIR"], "gen_report_for_mapping_to_references_for_removal.log")
    input:
        ids_to_remove = expand(os.path.join(dirs_dict["QC_DIR"], '{sample}-ids-to-remove.txt'), sample=M.sample_names),
    output:
        report_file = os.path.join(dirs_dict["QC_DIR"], "short-read-removal-report.txt")
    params:
    threads: 1
    resources: nodes=1
    run:
        M.gen_report_with_references_for_removal_info(input, output[0])


rule gen_qc_report:
    version: 1.0
    log: dirs_dict["LOGS_DIR"] + "/gen_qc_report.log"
    input: expand(dirs_dict["QC_DIR"] + "/{sample}-STATS.txt", sample=M.sample_names)
    output: dirs_dict["QC_DIR"] + "/qc-report.txt"
    threads: M.T('gen_qc_report')
    resources: nodes = M.T('gen_qc_report')
    run:
        report_dict = {}
        report_column_headers = ['number of pairs analyzed',
                                 'total pairs passed',
                                 'total pairs passed (percent of all pairs)',
                                 'total pair_1 trimmed',
                                 'total pair_1 trimmed (percent of all passed pairs)',
                                 'total pair_2 trimmed',
                                 'total pair_2 trimmed (percent of all passed pairs)',
                                 'total pairs failed',
                                 'total pairs failed (percent of all pairs)',
                                 'pairs failed due to pair_1',
                                 'pairs failed due to pair_1 (percent of all failed pairs)',
                                 'pairs failed due to pair_2',
                                 'pairs failed due to pair_2 (percent of all failed pairs)',
                                 'pairs failed due to both',
                                 'pairs failed due to both (percent of all failed pairs)',
                                 'FAILED_REASON_P',
                                 'FAILED_REASON_P (percent of all failed pairs)',
                                 'FAILED_REASON_N',
                                 'FAILED_REASON_N (percent of all failed pairs)',
                                 'FAILED_REASON_C33',
                                 'FAILED_REASON_C33 (percent of all failed pairs)']
        for filename in input:
            sample = os.path.basename(filename).split("-STATS.txt")[0]
            report_dict[sample] = dict.fromkeys(report_column_headers, 0)
            with open(filename,'r') as f:
                firstline = True
                for line in f.readlines():
                    s1 = line.split(':')
                    numeric_header = s1[0].strip()
                    s2 = s1[1].split('(')
                    numeric = s2[0].strip()
                    report_dict[sample][numeric_header] = numeric
                    if not firstline:
                        s3 = s2[1].split(' ')
                        percent = s3[0].strip('%')
                        percent_header = numeric_header + " (percent " + " ".join(s3[1:])
                        percent_header = percent_header.strip()
                        report_dict[sample][percent_header] = percent
                    else:
                        firstline = False
        u.store_dict_as_TAB_delimited_file(report_dict, output[0], headers= ["sample"] + report_column_headers)


def fastq_input_for_assembly(wildcards):
    ''' Creating a dictionary containing the path to input fastq file.
        The reason we can't use get_fastq is because we need to get
        the fastq files for all samples that belong to a group, and
        get_fastq only gives the pair of fastq file for one sample
    '''
    samples = list(M.samples_information[M.samples_information["group"] == wildcards.group]["sample"])
    r1 = []
    r2 = []
    for sample in samples:
        d = get_fastq(sample)
        r1.append(d['r1'])
        r2.append(d['r2'])

    return {'r1': r1, 'r2': r2}


rule merge_fastas_for_co_assembly:
    version: 1.0
    log: dirs_dict["LOGS_DIR"] + "/{group}-merge_fastas_for_co_assembly.log"
    input: unpack(fastq_input_for_assembly)
    output: temp(dirs_dict["QC_DIR"] + "/{group}-merged.fa")
    threads: M.T('merge_fastas_for_co_assembly')
    resources: nodes = M.T('merge_fastas_for_co_assembly')
    run:
        concatenate = False
        if len(input.r1) > 1:
            concatenate = True
        for r1, r2 in zip(input.r1, input.r2):
            need_to_uncompress = need_to_uncompress_fastqs(r1, r2)
            r1, r2 = uncompress_fastqs_if_needed(r1, r2, log)

            # run fq2fa
            prefix1 = os.path.basename(r1).split('.fastq')[0]
            prefix2 = os.path.basename(r2).split('.fastq')[0]
            temp_merged_name = prefix1 + prefix2 + '.temp.fa'
            fq2fa_output = os.path.join(dirs_dict["QC_DIR"], temp_merged_name)
            shell("fq2fa --merge %s %s %s >> %s 2>&1" % (r1, r2, fq2fa_output, log))

            if need_to_uncompress:
                # if we had to unzip the fastq files then now we delete the unzipped file
                shell("rm %s %s 2>>%s" % (r1, r2, log))

            if concatenate:
                # concatenate
                shell("cat %s >> %s 2>>%s" % (fq2fa_output, output, log))
                # delete individual fasta file
                shell("rm %s 2>>%s" % (fq2fa_output, log))
            else:
                shell("mv %s %s 2>>%s" % (fq2fa_output, output, log))


rule merge_fastqs_for_co_assembly:
    version: 1.0
    log: dirs_dict["LOGS_DIR"] + "/{group}-merge_fastqs_for_co_assembly.log"
    input: unpack(fastq_input_for_assembly)
    output:
        r1 = temp(os.path.join(dirs_dict["QC_DIR"], "{group}-merged_R1.fastq")),
        r2 = temp(os.path.join(dirs_dict["QC_DIR"], "{group}-merged_R2.fastq"))
    threads: M.T('merge_fastqs_for_co_assembly')
    resources: nodes = M.T('merge_fastqs_for_co_assembly')
    run:
        for r1, r2 in zip(input.r1, input.r2):
            need_to_uncompress = need_to_uncompress_fastqs(r1, r2)
            r1, r2 = uncompress_fastqs_if_needed(r1, r2, log)

            # concatenate
            shell("cat %s >> %s 2>>%s" % (r1, output.r1, log))
            shell("cat %s >> %s 2>>%s" % (r2, output.r2, log))

            if need_to_uncompress:
                # if we had to unzip the fastq files then now we delete the unzipped file
                shell("rm %s %s 2>>%s" % (r1, r2, log))

gzip_fastq_output = os.path.join(dirs_dict["QC_DIR"],\
                                 "{sample}-QUALITY_PASSED_{R}.fastq.gz")
rule gzip_fastqs:
    ''' Compressing the quality controlled fastq files'''
    version: 1.0
    log: dirs_dict["LOGS_DIR"] + "/{sample}-{R}-gzip.log"
    input:
        fastq = os.path.join(dirs_dict["QC_DIR"], "{sample}-QUALITY_PASSED_{R}.fastq")
    output: temp(gzip_fastq_output) if M.remove_short_reads_based_on_references \
                                else gzip_fastq_output
    threads: M.T('gzip_fastqs')
    resources: nodes = M.T('gzip_fastqs'),
    shell: "gzip {input.fastq} >> {log} 2>&1"


if run_idba_ud == True:
    rule idba_ud:
        version: 1.0
        log: dirs_dict["LOGS_DIR"] + "/{group}-idba_ud.log"
        input:
            fasta = dirs_dict["QC_DIR"] + "/{group}-merged.fa"
        output:
            contigs = temp(os.path.join(dirs_dict["FASTA_DIR"], "{group}", "final.contigs.fa")),
            # idba_ud generates both contigs.fa and scaffolds.fa
            # the user can choose which one will be used for the rest of the analysis
            # the other (unused) file will still be kept in it's raw form just in case the user wants it.
            raw_contigs_or_scaffold = os.path.join(dirs_dict["FASTA_DIR"], "{group}", "contigs.fa") if M.use_scaffold_from_idba_ud \
                                               else os.path.join(dirs_dict["FASTA_DIR"], "{group}", "scaffolds.fa")
        params:
            temp_dir = temp(dirs_dict["FASTA_DIR"] + "/{group}_TEMP"),
            mink = M.get_rule_param("idba_ud", "--mink"),
            maxk = M.get_rule_param("idba_ud", "--maxk"),
            step = M.get_rule_param("idba_ud", "--step"),
            inner_mink = M.get_rule_param("idba_ud", "--inner_mink"),
            inner_step = M.get_rule_param("idba_ud", "--inner_step"),
            prefix = M.get_rule_param("idba_ud", "--prefix"),
            min_count = M.get_rule_param("idba_ud", "--min_count"),
            min_support = M.get_rule_param("idba_ud", "--min_support"),
            seed_kmer = M.get_rule_param("idba_ud", "--seed_kmer"),
            min_contig = M.get_rule_param("idba_ud", "--min_contig"),
            similar = M.get_rule_param("idba_ud", "--similar"),
            max_mismatch = M.get_rule_param("idba_ud", "--max_mismatch"),
            min_pairs = M.get_rule_param("idba_ud", "--min_pairs"),
            no_bubble = M.get_rule_param("idba_ud", "--no_bubble"),
            no_local = M.get_rule_param("idba_ud", "--no_local"),
            no_coverage = M.get_rule_param("idba_ud", "--no_coverage"),
            no_correct = M.get_rule_param("idba_ud", "--no_correct"),
            pre_correction = M.get_rule_param("idba_ud", "--pre_correction"),
        threads: M.T('idba_ud')
        resources: nodes = M.T('idba_ud')
        run:
            cmd = "idba_ud -o {params.temp_dir} --read {input.fasta} --num_threads {threads} " + \
                  "{params.mink} {params.maxk} {params.step} {params.inner_mink} " + \
                  "{params.inner_step} {params.prefix} {params.min_count} " + \
                  "{params.min_support} {params.seed_kmer} {params.min_contig} " + \
                  "{params.similar} {params.max_mismatch} {params.min_pairs} " + \
                  "{params.no_bubble} {params.no_local} {params.no_coverage} " + \
                  "{params.no_correct} {params.pre_correction} >> {log} 2>&1"
            shell("echo Running the following command: >> {log} 2>&1")
            shell("echo %s >> {log} 2>&1" % cmd)
            shell(cmd)

            if M.use_scaffold_from_idba_ud:
                # we will use scaffolds for the rest of the analysis, but still keep the raw contigs output in case the user will want it later on.
                use = "scaffold.fa"
                keep = "contig.fa"
            else:
                # we will use contigs and keep the raw scaffolds output
                use = "contig.fa"
                keep = "scaffold.fa"

            shell("mv %s %s" % (os.path.join(params.temp_dir, use), output.contigs))
            shell("mv %s %s" % (os.path.join(params.temp_dir, keep), output.raw_contigs_or_scaffold))
            shell("rm -rf {params.temp_dir}")


def input_for_metaspades(wildcards):
    '''
        The purpose of this functino is to figure out whether co-assembly is done.
        if co-assembly is done then we need to first merge the fastq files
    '''
    input_file_dict = {}
    d = fastq_input_for_assembly(wildcards)
    if len(d['r1']) > 1:
        input_file_dict = {'r1': temp(os.path.join(dirs_dict["QC_DIR"], wildcards.group + "-merged_R1.fastq")),\
                           'r2': temp(os.path.join(dirs_dict["QC_DIR"], wildcards.group + "-merged_R2.fastq"))}
    else:
        input_file_dict = {'r1': d['r1'],\
                           'r2': d['r2']}
    return input_file_dict


if M.run_metaspades == True:
    rule metaspades:
        version: 1.0
        log: dirs_dict["LOGS_DIR"] + "/{group}-metaspades.log"
        input: unpack(input_for_metaspades)
        output:
            contigs = temp(os.path.join(dirs_dict["FASTA_DIR"], "{group}", "final.contigs.fa")),
            # metaspades generates both contigs.fasta and scaffolds.fasta
            # the user can choose which one will be used for the rest of the analysis
            # the other (unused) file will still be kept in it's raw form just in case the user wants it.
            raw_contigs_or_scaffold = os.path.join(dirs_dict["FASTA_DIR"], "{group}", "contigs.fasta") if M.use_scaffold_from_metaspades \
                                               else os.path.join(dirs_dict["FASTA_DIR"], "{group}", "scaffolds.fasta")
        params:
            temp_dir = dirs_dict["FASTA_DIR"] + "/{group}_TEMP",
            additional_params = M.get_param_value_from_config(["metaspades", "additional_params"])
        threads: M.T('metaspades')
        resources: nodes = M.T('metaspades')
        run:
            cmd = "metaspades.py -1 {input.r1} -2 {input.r2} " + \
                  "-t {threads} " + \
                  "{params.additional_params} " + \
                  "-o {params.temp_dir} >> {log} 2>&1"

            shell(cmd)

            if M.use_scaffold_from_metaspades:
                # we will use scaffolds for the rest of the analysis, but still keep the raw contigs output in case the user will want it later on.
                use = "scaffolds.fasta"
                keep = "contigs.fasta"
            else:
                # we will use contigs and keep the raw scaffolds output
                use = "contigs.fasta"
                keep = "scaffolds.fasta"

            shell("mv %s %s" % (os.path.join(params.temp_dir, use), output.contigs))
            shell("mv %s %s" % (os.path.join(params.temp_dir, keep), output.raw_contigs_or_scaffold))
            shell("rm -rf {params.temp_dir}")


if M.get_param_value_from_config(['megahit', 'run']) == True:
    rule megahit:
        '''
            Assembling fastq files using megahit.

            All files created by megahit are stored in a temporary folder,
            and only the fasta file is kept for later analysis.
        '''
        version: 1.0
        log: dirs_dict["LOGS_DIR"] + "/{group}-megahit.log"
        input: unpack(fastq_input_for_assembly)
        params:
            temp_dir = dirs_dict["FASTA_DIR"] + "/{group}_TEMP",
            # the minimum length for contigs (smaller contigs will be discarded)
            min_contig_len = M.get_rule_param("megahit", "--min-contig-len"),
            min_count = M.get_rule_param("megahit", "--min-count"),
            k_min = M.get_rule_param("megahit", "--k-min"),
            k_max = M.get_rule_param("megahit", "--k-max"),
            k_step = M.get_rule_param("megahit", "--k-step"),
            k_list = M.get_rule_param("megahit", "--k-list"),
            no_mercy = M.get_rule_param("megahit", "--no-mercy"),
            no_bubble = M.get_rule_param("megahit", "--no-bubble"),
            merge_level = M.get_rule_param("megahit", "--merge-level"),
            prune_level = M.get_rule_param("megahit", "--prune-level"),
            prune_depth = M.get_rule_param("megahit", "--prune-depth"),
            low_local_ratio = M.get_rule_param("megahit", "--low-local-ratio"),
            max_tip_len = M.get_rule_param("megahit", "--max-tip-len"),
            no_local = M.get_rule_param("megahit", "--no-local"),
            kmin_1pass = M.get_rule_param("megahit", "--kmin-1pass"),
            presets = M.get_rule_param("megahit", "--presets"),
            memory = M.get_rule_param("megahit", "--memory"),
            mem_flag = M.get_rule_param("megahit", "--mem-flag"),
            use_gpu = M.get_rule_param("megahit", "--use-gpu"),
            gpu_mem = M.get_rule_param("megahit", "--gpu-mem"),
            keep_tmp_files = M.get_rule_param("megahit", "--keep-tmp-files"),
            tmp_dir = M.get_rule_param("megahit", "--tmp-dir"),
            _continue = M.get_rule_param("megahit", "--continue"),
            verbose = M.get_rule_param("megahit", "--verbose"),
        # Notice that megahit requires a directory to be specified as
        # output. If the directory already exists then megahit will not
        # run. To avoid this, the for megahit is a temporary directory,
        # once megahit is done running then the contigs database is moved
        # to the final location.
        output:
            contigs = temp(dirs_dict["FASTA_DIR"] + "/{group}/final.contigs.fa")
        threads: M.T('megahit')
        resources: nodes = M.T('megahit'),
        # Making this rule a shadow rule so all extra files created by megahit
        # are not retaineded (it is not enough to define the directory as temporary
        # because when failing in the middle of a run, snakemake doesn't delete directories)
        run:
            r1 = ','.join(input.r1)
            r2 = ','.join(input.r2)

            cmd = "megahit -1 %s -2 %s " % (r1, r2) + \
                "-o {params.temp_dir} " + \
                "-t {threads} " + \
                "{params.min_contig_len} {params.min_count} {params.k_min} " + \
                "{params.k_max} {params.k_step} {params.k_list} {params.no_mercy} " + \
                "{params.no_bubble} {params.merge_level} {params.prune_level} " + \
                "{params.prune_depth} {params.low_local_ratio} {params.max_tip_len} " + \
                "{params.no_local} {params.kmin_1pass} {params.presets} {params.memory} " + \
                "{params.mem_flag} {params.use_gpu} {params.gpu_mem} {params.keep_tmp_files} " + \
                "{params.tmp_dir} {params._continue} {params.verbose} >> {log} 2>&1"
            print("Running: %s" % cmd)
            shell(cmd)
            shell("mv {params.temp_dir}/final.contigs.fa {output.contigs} >> {log} 2>&1")
            shell("rm -rf {params.temp_dir}")


def build_expected_bowtie_build_output_file(rev=False):
    """Build string template for the expected output file of bowtie build

    Returns a string such as 'some_dir/{group}/{group}-contigs.rev.{i}.bt2'

    Parameters
    ==========
    rev : bool
        Is it a .rev file?
    """
    additional_params = M.get_param_value_from_config(["bowtie_build", "additional_params"])
    ext_suffix = 'l' if '--large-index' in additional_params else ''
    prefix = '.rev' if rev else ''
    return dirs_dict["MAPPING_DIR"] + "/{group}/{group}-contigs" + prefix + '.{i}.bt2' + ext_suffix


rule bowtie_build:
    """Run bowtie-build on the contigs fasta"""
    # TODO: consider running this as a shadow rule
    version: 1.0
    log: dirs_dict["LOGS_DIR"] + "/{group}-bowtie_build.log"
    input: M.get_fasta
    # I touch this file because the files created have different suffix
    output:
        o1 = expand(build_expected_bowtie_build_output_file(), i=[1,2,3,4], group="{group}"),
        o2 = expand(build_expected_bowtie_build_output_file(rev=True), i=[1,2], group="{group}")
    params:
        prefix = dirs_dict["MAPPING_DIR"] + "/{group}/{group}-contigs",
        additional_params = M.get_param_value_from_config(["bowtie_build", "additional_params"])
    threads: M.T('bowtie_build')
    resources: nodes = M.T('bowtie_build'),
    shell: "bowtie2-build {input} {params.prefix} --threads {threads} {params.additional_params} >> {log} 2>&1"


def input_for_bowtie(wildcards):
    '''Creating a dictionary containing the input files for bowtie.'''
    d = {'build_output': build_expected_bowtie_build_output_file().format(i=1, group=wildcards.group)}
    if wildcards.group in M.references_for_removal:
        post_reference_based_short_read_removal = False
    else:
        post_reference_based_short_read_removal = M.remove_short_reads_based_on_references
    # add the fastq files paths to the dictionary:
    d.update(get_fastq(wildcards.sample, post_reference_based_short_read_removal=post_reference_based_short_read_removal))
    return d


rule bowtie:
    """Run mapping with bowtie2"""
    version: 1.0
    log: dirs_dict["LOGS_DIR"] + "/{group}-{sample}-bowtie.log"
    input: unpack(input_for_bowtie)
    # setting the output as temp, since we only want to keep the bam file.
    output:
        sam = temp(dirs_dict["MAPPING_DIR"] + "/{group}/{sample}.sam")
    params:
        bowtie_build_prefix = rules.bowtie_build.params.prefix,
        additional_params = M.get_param_value_from_config(["bowtie", "additional_params"])
    threads: M.T('bowtie')
    resources: nodes = M.T('bowtie'),
    shell:
        """
            bowtie2 --threads {threads} -x {params.bowtie_build_prefix} -1 {input.r1} -2 {input.r2} \
            {params.additional_params} -S {output.sam} >> {log} 2>&1
        """


rule samtools_view:
    """ sort sam file with samtools and create a RAW.bam file"""
    version: 1.0
    log: dirs_dict["LOGS_DIR"] + "/{group}-{sample}-samtools_view.log"
    input: rules.bowtie.output.sam
    params: additional_params = M.get_param_value_from_config(["samtools_view", "additional_params"])
    # output as temp. we only keep the final bam file
    output: temp(dirs_dict["MAPPING_DIR"] + "/{group}/{sample}-RAW.bam")
    threads: M.T('samtools_view')
    resources: nodes = M.T('samtools_view'),
    shell: "samtools view -bS {input} -o {output} {params.additional_params} >> {log} 2>&1"


rule anvi_init_bam:
    """ run anvi-init-bam on RAW bam file to create a bam file ready for anvi-profile"""
    version: 1.0 # later we can decide if we want the version to use the version of anvi'o
    log: dirs_dict["LOGS_DIR"] + "/{group}-{sample}-anvi_init_bam.log"
    input: rules.samtools_view.output
    output:
        bam = dirs_dict["MAPPING_DIR"] + "/{group}/{sample}.bam",
        bai = dirs_dict["MAPPING_DIR"] + "/{group}/{sample}.bam.bai"
    threads: M.T('anvi_init_bam')
    resources: nodes = M.T('anvi_init_bam'),
    shell: "anvi-init-bam {input} -o {output.bam} -T {threads} >> {log} 2>&1"


sample_name = M.get_param_value_from_config(['anvi_profile', '--sample-name'])
if sample_name != M.default_config['anvi_profile']['--sample-name'] and sample_name is not None:
    run.warning('You chose to set the "--sample-name" for your profile databases '
                'in the config file to %s. You are welcomed to do that, but at your own '
                'risk. Just so you know, by default the sample name would match '
                'the name defined either in the samples_txt, by choosing to provide '
                'a different name, it means that all your profile databases would have '
                'the same name, unless you incloded "{sample}" in the name you provided '
                'but even then, we did not test that option and we are not sure it would '
                'work...' % sample_name)


def get_cluster_contigs_param(wildcards):
    """ helper function to sort out whether to cluster contigs for a single profile database"""
    if M.get_param_value_from_config(['anvi_profile', '--cluster-contigs']):
        run.warning('You chose to set the value for --cluster-contigs as %s. You can do that if you '
                    'choose to, but just so you know, if you don\'t provide a value then '
                    'the workflow would automatically cluster contigs for profile databases '
                    'that are not merged (i.e. profiles that belong to groups of size 1).' \
                    % M.get_param_value_from_config(['anvi_profile', '--cluster-contigs']))
        cluster_contigs = M.get_rule_param('anvi_profile', '--cluster-contigs')
    else:
        # if profiling to individual assembly then clustering contigs
        # see --cluster-contigs in the help manu of anvi-profile
        cluster_contigs = '--cluster-contigs' if M.group_sizes[wildcards.group] == 1 else ''
    return cluster_contigs


rule anvi_profile:
    """ run anvi-profile on the bam file"""
    # setting the rule version to be as the version of the profile database of anvi'o
    version: anvio.__profile__version__
    log: dirs_dict["LOGS_DIR"] + "/{group}-{sample}-anvi_profile.log"
    input:
        bam = dirs_dict["MAPPING_DIR"] + "/{group}/{sample}.bam",
        # marking the contigs.db as ancient in order to ignore timestamps.
        contigs = ancient(M.get_contigs_db_path())
    output:
        profile = dirs_dict["PROFILE_DIR"] + "/{group}/{sample}/PROFILE.db",
        AUXILIARY_DATA = dirs_dict["PROFILE_DIR"] + "/{group}/{sample}/AUXILIARY-DATA.db",
        runlog = dirs_dict["PROFILE_DIR"] + "/{group}/{sample}/RUNLOG.txt"
    params:
        output_dir = dirs_dict["PROFILE_DIR"] + "/{group}/{sample}",
        cluster_contigs = get_cluster_contigs_param,
        sample_name = M.get_rule_param("anvi_profile", "--sample-name"),
        overwrite_output_destinations = M.get_rule_param("anvi_profile", "--overwrite-output-destinations"),
        report_variability_full = M.get_rule_param("anvi_profile", "--report-variability-full"),
        skip_SNV_profiling = M.get_rule_param("anvi_profile", "--skip-SNV-profiling"),
        profile_SCVs = M.get_rule_param("anvi_profile", "--profile-SCVs"),
        description = M.get_rule_param("anvi_profile", "--description"),
        skip_hierarchical_clustering = M.get_rule_param("anvi_profile", "--skip-hierarchical-clustering"),
        distance = M.get_rule_param("anvi_profile", "--distance"),
        linkage = M.get_rule_param("anvi_profile", "--linkage"),
        min_contig_length = M.get_rule_param("anvi_profile", "--min-contig-length"),
        min_mean_coverage = M.get_rule_param("anvi_profile", "--min-mean-coverage"),
        min_coverage_for_variability = M.get_rule_param("anvi_profile", "--min-coverage-for-variability"),
        contigs_of_interest = M.get_rule_param("anvi_profile", "--contigs-of-interest"),
        queue_size = M.get_rule_param("anvi_profile", "--queue-size"),
        write_buffer_size_per_thread = M.get_rule_param("anvi_profile", "--write-buffer-size-per-thread"),
        max_contig_length = M.get_rule_param("anvi_profile", "--max-contig-length"),
        max_coverage_depth = M.get_rule_param("anvi_profile", "--max-coverage-depth"),
    threads: M.T('anvi_profile')
    resources: nodes = M.T('anvi_profile'),
    shell:
        """
            anvi-profile -i {input.bam} -c {input.contigs} -o {params.output_dir} \
                             {params.cluster_contigs} {params.min_contig_length} \
                             {params.sample_name} -T {threads} {params.overwrite_output_destinations} \
                             {params.profile_SCVs} {params.report_variability_full} \
                             {params.skip_SNV_profiling} {params.description} \
                             {params.skip_hierarchical_clustering} {params.distance} \
                             {params.linkage} {params.min_mean_coverage} \
                             {params.max_coverage_depth} \
                             {params.min_coverage_for_variability} {params.contigs_of_interest} \
                             {params.queue_size} {params.write_buffer_size_per_thread} {params.max_contig_length} >> {log} 2>&1
        """


def input_for_anvi_merge(wildcards):
    '''
        Create dictionary as input for rule anvi_merge.
        The reason we need a function as an input is to allow the user
        to choose between an option of an "all against all" vs. "normal"
        modes. See the documentation to learn more about the difference
        between these modes.
    '''

    if M.get_param_value_from_config(['all_against_all']):
        # If the user specified 'all against all' in the configs file
        # the end product would be a merge of all samples per group
        profiles = expand(dirs_dict["PROFILE_DIR"] + "/{group}/{sample}/PROFILE.db", sample=list(M.samples_information['sample']), group=wildcards.group)

    else:
        # The default behaviour is to only merge (and hence map and profile)
        # together samples that belong to the same group.
        profiles = expand(dirs_dict["PROFILE_DIR"] + "/{group}/{sample}/PROFILE.db", sample=list(M.samples_information[M.samples_information['group'] == wildcards.group]['sample']), group=wildcards.group)

    return profiles


def configure_flag_files_for_anvi_merge_optional_inputs(wildcards, flag_file_name, run_flag):
    '''
        this function creates the input to anvi_merge rule with
        regards to percent_of_reads_mapped_imported_flag_input.
    '''
    if M.get_param_value_from_config(['all_against_all']):
        flag_files = expand(os.path.join(dirs_dict["PROFILE_DIR"], "{group}/{sample}", flag_file_name), sample=list(M.samples_information['sample']), group=wildcards.group)

    else:
        flag_files = expand(os.path.join(dirs_dict["PROFILE_DIR"], "{group}/{sample}", flag_file_name), sample=list(M.samples_information[M.samples_information['group'] == wildcards.group]['sample']), group=wildcards.group)

    if not run_flag:
        flag_files = ancient(dirs_dict["CONTIGS_DIR"] + "/%s-contigs.db" % wildcards.group)

    return flag_files


def create_fake_output_files(_message, output):
    # creating "fake" output files with an informative message for
    # user.
    for o in output:
        with open(o, 'w') as f:
            f.write(_message + '\n')


def remove_empty_profile_databases(profiles, group):
    '''remove profiles that recruited zero reads from the metagenome.'''

    empty_profiles = []
    progress.new("Checking for empty profile databases")
    for p in profiles:
        keys, data = TableForLayerAdditionalData(argparse.Namespace(profile_db=p)).get(['total_reads_mapped'])

        if not next(iter(data.values()))['total_reads_mapped']:
            # this profile is empty so we can't include it in the merged profile.
            empty_profiles.append(p)

    profiles = list(set(profiles) - set(empty_profiles))
    progress.end()

    if not profiles:
        # if there are no profiles to merge then notify the user
        run.warning('It seems that all your profiles are empty for the '
                    'contigs database: %s.db. And so cannot be merged.' \
                     % group)

    run.info('Number of non-empty profile databases', len(profiles))
    run.info('Number of empty profile databases', len(empty_profiles))
    if len(empty_profiles) > 0:
        run.info('The following databases are empty: ', empty_profiles)

    return profiles


rule gen_readme_file_for_unmerged_groups:
    version: 1.0
    log: os.path.join(dirs_dict["LOGS_DIR"], "{group}-gen_readme_file_for_unmerged_groups.log")
    input:
        contigs = ancient(M.get_contigs_db_path()),
        profiles = input_for_anvi_merge,
        percent_of_reads_mapped_imported_flag = lambda wildcards: configure_flag_files_for_anvi_merge_optional_inputs(wildcards, 'import_percent_of_reads_mapped.done', run_import_percent_of_reads_mapped),
        kraken_flag = lambda wildcards: configure_flag_files_for_anvi_merge_optional_inputs(wildcards, "import_krakenuniq_taxonomy.done", M.run_krakenuniq)
    output: os.path.join(M.dirs_dict["MERGE_DIR"], "{group}", "README.txt")
    params:
    threads: w.T(config, 'gen_readme_file_for_unmerged_groups', 1)
    resources: nodes = w.T(config, 'gen_readme_file_for_unmerged_groups', 1)
    shell:
        """
            echo -e 'The group {wildcards.group} has only one sample. Hence, there is nothing to merge, but you can find\n\
                     the profile database here: {input.profiles}. Also, just so you know, profile was done using\n\
                     --cluster-contigs so you can visualize this profile database using anvi-interactive.' > {output} 2>>{log}
        """


rule anvi_merge:
    '''
        Run create a merged profile database.

        If there are multiple profiles mapped to the same contigs database,
        then merges these profiles. For individual profile, creates a symlink
        to the profile database. The purpose is to have one folder in
        which for every contigs database there is a profile database (or
        a symlink to a profile database) that could be used together for
        anvi-interactive.
    '''
    version: anvio.__profile__version__
    log: dirs_dict["LOGS_DIR"] + "/{group}-anvi_merge.log"
    # The input are all profile databases that belong to the same group
    input:
        # marking the contigs.db as ancient in order to ignore timestamps.
        contigs = ancient(M.get_contigs_db_path()),
        profiles = input_for_anvi_merge,
        percent_of_reads_mapped_imported_flag = lambda wildcards: configure_flag_files_for_anvi_merge_optional_inputs(wildcards, 'import_percent_of_reads_mapped.done', run_import_percent_of_reads_mapped),
        kraken_flag = lambda wildcards: configure_flag_files_for_anvi_merge_optional_inputs(wildcards, "import_krakenuniq_taxonomy.done", M.run_krakenuniq)
    output:
        profile = dirs_dict["MERGE_DIR"] + "/{group}/PROFILE.db",
        AUXILIARY_DATA = dirs_dict["MERGE_DIR"] + "/{group}/AUXILIARY-DATA.db",
        runlog = dirs_dict["MERGE_DIR"] + "/{group}/RUNLOG.txt"
    threads: M.T('anvi_merge')
    resources: nodes = M.T('anvi_merge'),
    params:
        output_dir = dirs_dict["MERGE_DIR"] + "/{group}",
        name = "{group}",
        profile_dir = dirs_dict["PROFILE_DIR"] + "/{group}",
        sample_name = M.get_rule_param("anvi_merge", "--sample-name"),
        description = M.get_rule_param("anvi_merge", "--description"),
        skip_hierarchical_clustering = M.get_rule_param("anvi_merge", "--skip-hierarchical-clustering"),
        enforce_hierarchical_clustering = M.get_rule_param("anvi_merge", "--enforce-hierarchical-clustering"),
        distance = M.get_rule_param("anvi_merge", "--distance"),
        linkage = M.get_rule_param("anvi_merge", "--linkage"),
        overwrite_output_destinations = M.get_rule_param("anvi_merge", "--overwrite-output-destinations"),
    run:
        # using run instead of shell so we can choose the appropriate shell command.
        # In accordance with: https://bitbucket.org/snakemake/snakemake/issues/37/add-complex-conditional-file-dependency#comment-29348196

        # remove empty profile databases
        input.profiles = remove_empty_profile_databases(input.profiles, wildcards.group)

        if not input.profiles:
            # there are no profiles to merge.
            # this should only happen if all profiles were empty.
            _message = "Nothing to merge for %s. This should " \
                       "only happen if all profiles were empty " \
                       "(you can check the log file: {log} to see " \
                       "if that is indeed the case). " \
                       "This file was created just so that your workflow " \
                       "would continue with no error (snakemake expects " \
                       "to find these output files and if we don't create " \
                       "them, then it will be upset). As we see it, " \
                       "there is no reason to throw an error here, since " \
                       "you mapped your metagenome to some fasta files " \
                       "and you got your answer: whatever you have in " \
                       "your fasta file is not represented in your  " \
                       "metagenomes. Feel free to contact us if you think " \
                       "that this is our fault. sincerely, Meren Lab" \
                       % wildcards.group
            # creating the expected output files for the rule
            create_fake_output_files(_message, output)
            shell("echo %s >> {log}" % _message)

        elif M.group_sizes[wildcards.group] == 1:
            # for individual assemblies, create a symlink to the profile database
            #shell("ln -s {params.profile_dir}/*/* -t {params.output_dir} >> {log} 2>&1")
            #shell("touch -h {params.profile_dir}/*/*")

            # Still waiting to get an answer on this issue:
            # https://groups.google.com/d/msg/snakemake/zU_wkfZ7YCs/GZP0Z_RoAgAJ
            # Until then, I will just create fake file so snakemake is happy
            _message = "Only one file was profiled with %s so there \
                       is nothing to merge. But dont worry, you can \
                       still use anvi-interacite with the single profile \
                       database that is here: %s" \
                       % (wildcards.group, input.profiles[0])
            create_fake_output_files(_message, output)
            shell("echo %s >> {log}" % _message)

        elif len(input.profiles) == 1:
            # if only one sample is not empty, but the group size was
            # bigger than 1 then it means that --cluster-contigs was
            # not performed during anvi-profile.
            _message = "Only one sample had reads recruited to %s " \
                       "and hence merging could not occur." \
                       % wildcards.group
            create_fake_output_files(_message, output)
            shell("echo %s >> {log}" % _message)

        else:
            shell("anvi-merge {input.profiles} -o {params.output_dir} -c {input.contigs} \
                   {params.sample_name} \
                   {params.overwrite_output_destinations} {params.description} \
                   {params.skip_hierarchical_clustering} {params.enforce_hierarchical_clustering} \
                   {params.distance} {params.linkage} >> {log} 2>&1")


rule anvi_import_collection:
    version: 1.0
    log: dirs_dict["LOGS_DIR"] + "/{group}-anvi_import_collection.log"
    input:
        profile = lambda wildcards: M.profile_databases[wildcards.group],
        contigs = ancient(M.get_contigs_db_path()),
        collection_file = lambda wildcards: M.collections[wildcards.group]['collection_file']
    output:
        imported_collection_flag = touch(os.path.join(dirs_dict["MERGE_DIR"], \
                                              '{group}', \
                                              'collection-import.done'))
    params:
        bins_info = lambda wildcards: M.collections[wildcards.group]['bins_info'],
        collection_name = lambda wildcards: M.collections[wildcards.group]['collection_name'],
        contigs_mode = lambda wildcards: M.collections[wildcards.group]['contigs_mode']
    threads: 1
    resources: nodes=1
    shell:
        """
           anvi-import-collection -p {input.profile} \
                                  -c {input.contigs} \
                                  -C {params.collection_name} \
                                  {params.bins_info} \
                                  {params.contigs_mode} \
                                  {input.collection_file} >> {log} 2>&1
        """


rule anvi_import_default_collection:
    version: 1.0
    log: dirs_dict["LOGS_DIR"] + "/{group}-anvi_import_default_collection.log"
    input:
        profile = lambda wildcards: M.profile_databases[wildcards.group],
        contigs = ancient(M.get_contigs_db_path()),
    output:
        imported_collection_flag = touch(os.path.join(dirs_dict["MERGE_DIR"], \
                                              '{group}', \
                                              'default-collection-import.done'))
    threads: 1
    resources: nodes=1
    shell:
        """
           anvi-script-add-default-collection -p {input.profile} \
                                              -b {wildcards.group} \
                                              -c {input.contigs} >> {log} 2>&1
        """


rule anvi_summarize:
    version: 1.0
    log: os.path.join(dirs_dict["LOGS_DIR"], "{group}-anvi_summarize.log")
    input:
        profile = lambda wildcards: M.profile_databases[wildcards.group],
        contigs = ancient(M.get_contigs_db_path()),
        imported_collection_flag = lambda wildcards: M.get_collection_import_flag(str(wildcards.group))
    output:
        summary_done = temp(touch(os.path.join(dirs_dict["SUMMARY_DIR"], "{group}-summary.touch")))
    params:
        collection_name = lambda wildcards: M.collections[wildcards.group]['collection_name'],
        summary_temp_dir = os.path.join(dirs_dict["SUMMARY_DIR"], "{group}-TEMP"),
        additional_params = M.get_param_value_from_config(["anvi_summarize", "additional_params"])
    threads: M.T('anvi_summarize')
    resources: nodes = M.T('anvi_summarize')
    shell:
        """
            anvi-summarize -p {input.profile} \
                           -c {input.contigs} \
                           -C {params.collection_name} \
                           {params.additional_params} \
                           -o {params.summary_temp_dir} >> {log} 2>&1
        """


rule anvi_split:
    version: 1.0
    log: os.path.join(dirs_dict["LOGS_DIR"], "{group}-anvi_split.log")
    input:
        profile = lambda wildcards: M.profile_databases[wildcards.group],
        contigs = ancient(M.get_contigs_db_path()),
        imported_collection_flag = os.path.join(dirs_dict["MERGE_DIR"], \
                                                '{group}', \
                                                'collection-import.done')
    output:
        split_done = touch(os.path.join(dirs_dict["SPLIT_PROFILES_DIR"], "{group}-split.done"))
    params:
        collection_name = lambda wildcards: M.collections[wildcards.group]['collection_name'],
        split_dir = os.path.join(dirs_dict["SPLIT_PROFILES_DIR"], "{group}"),
        additional_params = M.get_param_value_from_config(["anvi_summarize", "additional_params"])
    threads: M.T('anvi_split')
    resources: nodes = M.T('anvi_split')
    shell:
        """
            anvi-split -p {input.profile} \
                       -c {input.contigs} \
                       -C {params.collection_name} \
                       -o {params.split_dir} >> {log} 2>&1
        """


rule dummy_rule_for_anvi_summarize:
    version: 1.0
    log: dirs_dict["LOGS_DIR"] + "/{group}-dummy_rule_for_anvi_summarize.log"
    input:
        summary_done = os.path.join(dirs_dict["SUMMARY_DIR"], "{group}-summary.touch")
    output:
        directory(os.path.join(dirs_dict["SUMMARY_DIR"], "{group}-SUMMARY"))
    params:
        summary_temp_dir = os.path.join(dirs_dict["SUMMARY_DIR"], "{group}-TEMP")
    threads: 1
    resources: nodes = 1
    shell: "mv {params.summary_temp_dir} {output} >> {log} 2>&1"


rule krakenuniq:
    version: 1.0
    log: dirs_dict["LOGS_DIR"] + "/{sample}-krakenuniq.log"
    input: unpack(lambda wildcards: get_fastq(wildcards.sample))
    output:
        raw = os.path.join(dirs_dict["TAXONOMY_DIR"], "{sample}.krakenuniq"),
        tsv = os.path.join(dirs_dict["TAXONOMY_DIR"], "{sample}-krakenuniq-report.tsv")
    params:
        db = M.get_rule_param("krakenuniq", "--db"),
        gzip_compressed = M.get_rule_param("krakenuniq", "--gzip-compressed"),
        additional_params = M.get_param_value_from_config(["krakenuniq", "additional_params"])
    threads: M.T('krakenuniq')
    resources: nodes = M.T('krakenuniq')
    shell:
        """
            krakenuniq \
                     {params.db} \
                     --threads {threads} \
                     --fastq-input \
                     {params.gzip_compressed} \
                     --paired \
                     --output {output.raw} \
                     --report-file {output.tsv} \
                     {input.r1} \
                     {input.r2} \
                     {params.additional_params} >> {log} 2>&1
        """

rule krakenuniq_mpa_report:
    version: 1.0
    log: dirs_dict["LOGS_DIR"] + "/{sample}-krakenuniq_mpa_report.log"
    input: os.path.join(dirs_dict["TAXONOMY_DIR"], "{sample}.krakenuniq"),
    output: os.path.join(dirs_dict["TAXONOMY_DIR"], "{sample}-krakenuniq.tsv")
    # FIXME: I need to see if more params should be available for krakenuniq_mpa_report
    params: db = M.get_rule_param("krakenuniq", "--db")
    threads: M.T('krakenuniq_mpa_report')
    resources: nodes = M.T('krakenuniq_mpa_report')
    shell:
        """
            krakenuniq-mpa-report {params.db} \
                                  --header-line \
                                  {input} \
                                  > {output} 2>> {log}
        """


def input_for_import_krakenuniq_taxonomy(wildcards):
    ''' configure input for import_percent_of_reads_mapped.'''
    D = {}
    if M.kraken_annotation_dict:
        # if the user supplied a file with kraken annotations then use it
        D['tsv'] = M.kraken_annotation_dict[wildcards.sample]['path']
    else:
        # if the user didn't provide a file, then annotation needs to run
        D['tsv'] = os.path.join(dirs_dict["TAXONOMY_DIR"], wildcards.sample + "-krakenuniq.tsv")

    if run_import_percent_of_reads_mapped:
        # we include percent_of_reads_mapped_imported_flag as input here so that kraken and import_percent_of_reads_mapped will never run in parallel.
        # If they run in parallel it could cause trouble as they will try to write to the profile database at the same time. very bad.
        D['percent_of_reads_mapped_imported_flag'] = os.path.join(dirs_dict["PROFILE_DIR"], wildcards.group, wildcards.sample, "import_percent_of_reads_mapped.done"),
    return D


rule import_krakenuniq_taxonomy:
    version: 1.0
    log: dirs_dict["LOGS_DIR"] + "/{group}-{sample}-import_krakenuniq_taxonomy.log"
    input:
        unpack(input_for_import_krakenuniq_taxonomy),
        profile = ancient(os.path.join(dirs_dict["PROFILE_DIR"], "{group}", "{sample}", "PROFILE.db"))
    output: touch(os.path.join(dirs_dict["PROFILE_DIR"], "{group}/{sample}/import_krakenuniq_taxonomy.done"))
    params: min_abundance = M.get_rule_param('import_krakenuniq_taxonomy', '--min-abundance')
    threads: M.T('import_krakenuniq_taxonomy')
    resources: nodes = M.T('import_krakenuniq_taxonomy')
    shell:
        """
            anvi-import-taxonomy-for-layers -p {input.profile} \
                                            -i {input.tsv} \
                                            --parser krakenuniq \
                                            {params.min_abundance} >> {log} 2>&1
        """


rule import_percent_of_reads_mapped:
    version: 1.0
    log: dirs_dict["LOGS_DIR"] + "/{group}-{sample}-import_percent_of_reads_mapped.log"
    input:
        profile = ancient(os.path.join(dirs_dict["PROFILE_DIR"], "{group}", "{sample}", "PROFILE.db"))
    output:
        flag = touch(dirs_dict["PROFILE_DIR"] + "/{group}/{sample}/import_percent_of_reads_mapped.done"),
        total_num_reads_txt = temp(dirs_dict["PROFILE_DIR"] + "/{group}/{sample}/total-num-reads.txt"),
        layers_additional_data = temp(dirs_dict["PROFILE_DIR"] + "/{group}/{sample}/layers-additional-data.txt"),
        layers_additional_data_updated = temp(dirs_dict["PROFILE_DIR"] + "/{group}/{sample}/layers-additional-data-updated.txt")
    params:
        bowtie_log = dirs_dict["LOGS_DIR"] + "/{group}-{sample}-bowtie.log"
    threads: M.T('import_percent_of_reads_mapped')
    resources: nodes = M.T('import_percent_of_reads_mapped')
    run:
        if not filesnpaths.is_file_exists(params.bowtie_log, dont_raise=True):
            raise ConfigError('We expected to find the log file for bowtie2 here: %s '
                        'but it is not there. This means we can\'t find out what '
                        'percentage of short reads mapped to the fasta file. '
                        'If you are confused, feel free to contact us. '
                        'To avoid this error you can skip this step by setting the "run" parameter for this rule '
                        '(import_percent_of_reads_mapped) to "false" in your config file.' % params.bowtie_log)

        else:
            shell("""
                    echo -e "sample\ttotal_num_reads" > {output.total_num_reads_txt}
                    # ok so this whole thing is a terrible hack anyway, so the next two lines
                    # shouldn't surprise you. Basically printing one line with the sample name
                    # and the the number of reads (which is grepped from the log file)
                    echo -e "`echo -n "{params.bowtie_log}" | awk 'BEGIN{{FS="-"}}{{printf("%s\t", $2)}}';\
                            grep 'reads; of these:' {params.bowtie_log} | awk '{{print $1 * 2}}'`" >> {output.total_num_reads_txt}

                    # import this into the profile database. this part is simply
                    # a 'hack' so we can get a file with both 'mapper reads' and
                    # 'total reads' columns in the next step.
                    anvi-import-misc-data {output.total_num_reads_txt} \
                                          -p {input.profile} \
                                          -t layers \
                                          --just-do-it >> {log} 2>&1

                    # get the layers additional data table, which has two columns
                    # now.
                    anvi-export-misc-data -p {input.profile} \
                                          -t layers \
                                          -o {output.layers_additional_data} >> {log} 2>&1
                  """)

            additional_data = pd.read_csv(output.layers_additional_data, sep='\t', index_col=None)
            if 'total_reads_mapped' not in additional_data.columns:
                raise ConfigError('The layers additional data for %s doesn\'t contain a column '
                                  '"total_reads_mapped". This is really weird, and we cannot '
                                  'think of why this happened (maybe you deleted it somehow?). '
                                  'Feel free to contact an anvio developer in order to troubleshoot '
                                  'this. Otherwise, if you wish to continue, you can just skip this '
                                  'step by setting the "run" parameter for the rule import_percent_of_reads_mapped '
                                  'to "false".' % input.profile)

            additional_data['percent_mapped'] = 100 * additional_data['total_reads_mapped'] / additional_data['total_num_reads']
            additional_data.to_csv(output.layers_additional_data_updated, sep='\t', index=False)

            shell("""
                    # import the new file with three columns
                    anvi-import-misc-data {output.layers_additional_data_updated} \
                                          -p {input.profile} \
                                          -t layers \
                                          --just-do-it >> {log} 2>&1
                  """)


rule anvi_cluster_contigs:
    version: 1.0
    log: os.path.join(dirs_dict["LOGS_DIR"], "{group}-{driver}-anvi_cluster_contigs.log")
    threads: M.T("anvi_cluster_contigs")
    resources: nodes = M.T("anvi_cluster_contigs")
    input:
        contigs = ancient(M.get_contigs_db_path()),
        profile = lambda wildcards: M.profile_databases[wildcards.group]
    params:
        driver = '{driver}',
        collection_name = M.get_rule_param('anvi_cluster_contigs', '--collection-name'),
        just_do_it = M.get_rule_param('anvi_cluster_contigs', '--just-do-it'),
        additional_params = lambda wildcards: M.get_param_value_from_config(['anvi_cluster_contigs', M.get_param_name_for_binning_driver(wildcards.driver)])
    output: touch(os.path.join(dirs_dict['MERGE_DIR'], '{group}' + '-' + '{driver}' + '.done'))
    shell: w.r("""anvi-cluster-contigs -c {input.contigs} \
                                       -p {input.profile} \
                                       --driver {params.driver} \
                                       {params.collection_name} \
                                       {params.just_do_it} \
                                       {params.additional_params} >> {log} 2>&1""")


if 'workflows/metagenomics' in workflow.included[0]:
    # check if all program dependencies are met. for this line to be effective,
    # there should be an initial dry run step (which is the default behavior of
    # the `WorkflowSuperClass`, so you are most likely covered).
    M.check_workflow_program_dependencies(workflow, dont_raise=True)
