# -*- coding: utf-8

import argparse
import pandas as pd

import anvio
import anvio.utils as u
import anvio.workflows as w
import anvio.terminal as terminal
import anvio.filesnpaths as filesnpaths

from anvio.errors import ConfigError
from anvio.workflows.contigs import ContigsDBWorkflow

__author__ = "Alon Shaiber"
__copyright__ = "Copyright 2017, The anvio Project"
__credits__ = []
__license__ = "GPL 3.0"
__version__ = anvio.__version__
__maintainer__ = "Alon Shaiber"
__email__ = "alon.shaiber@gmail.com"

run = terminal.Run()

# When the first entry of `workflow.included` points into 'workflows/contigs'
# this file is the workflow being run; otherwise it is treated as running in
# slave mode (presumably because another workflow included it).
slave_mode = 'workflows/contigs' not in workflow.included[0]

if not slave_mode:
    # `config` is not assigned anywhere in this file: snakemake has already
    # filled it in by the time this code runs. Note that `M` and `dirs_dict`
    # are only created here when NOT in slave mode.
    workflow_args = argparse.Namespace(config=config, slave_mode=slave_mode)
    M = ContigsDBWorkflow(workflow_args)
    M.init()
    dirs_dict = M.dirs_dict

localrules: annotate_contigs_database

fasta_txt_file = M.get_param_value_from_config('fasta_txt', repress_default=True)

# stays empty unless a fasta_txt file was configured
group_names = []

if fasta_txt_file:
    # every key of the fasta_txt table is one group; `references_mode` is
    # only assigned by this file when a fasta_txt file is given.
    filesnpaths.is_file_exists(fasta_txt_file)
    fasta_information = u.get_TAB_delimited_file_as_dictionary(fasta_txt_file)
    group_names = list(fasta_information)
    references_mode = True


def _contigs_step_is_enabled(rule_name):
    ''' True only if the "run" entry of `rule_name` in the config equals True'''
    return M.get_param_value_from_config([rule_name, "run"]) == True


# switches for the optional steps of the workflow
# (by default this one is NOT documented either way in this file)
run_remove_human_dna_using_centrifuge = _contigs_step_is_enabled("remove_human_dna_using_centrifuge")

# taxonomy with centrifuge: NOT run by default
run_taxonomy_with_centrifuge = _contigs_step_is_enabled("centrifuge")

# anvi_run_hmms: run by default
run_anvi_run_hmms = _contigs_step_is_enabled("anvi_run_hmms")

# anvi_run_ncbi_cogs: run by default
run_anvi_run_ncbi_cogs = _contigs_step_is_enabled("anvi_run_ncbi_cogs")

# anvi-script-run-eggnog-mapper: NOT run by default
run_anvi_script_run_eggnog_mapper = _contigs_step_is_enabled("anvi_script_run_eggnog_mapper")

# sanity check for centrifuge db: when taxonomy with centrifuge is requested
# the config must also provide a value for ["centrifuge", "db"], so fail
# early with a ConfigError (imported from anvio.errors at the top of the
# file) instead of failing later inside the centrifuge rule.
if run_taxonomy_with_centrifuge:
    if not M.get_param_value_from_config(["centrifuge", "db"]):
        raise ConfigError("If you plan to run centrifuge, then you must "
                          "provide a path for the centrifuge db in the "
                          "config file. See documentation for more details.")

def get_raw_fasta(wildcards):
    '''
        Return the path of the raw (not reformatted) fasta file of a group.

        In "reference mode" the path is the 'path' entry that the
        fasta_txt information holds for the group. Otherwise ("assembly
        mode") it is the final.contigs.fa file in the directory of the
        group under FASTA_DIR.
    '''
    if not references_mode:
        # assembly mode: the input fasta is the assembly output
        return dirs_dict["FASTA_DIR"] + "/%s/final.contigs.fa" % wildcards.group

    # reference mode: the input is the reference fasta listed for the group
    return fasta_information[wildcards.group]['path']

# Target rule: it has no output or command of its own, it only requests the
# "annotate_contigs_database" flag file of every group in `group_names`.
rule generate_and_annotate_contigs_db:
    input: expand(dirs_dict['CONTIGS_DIR'] + "/{group}-annotate_contigs_database.done", group=group_names)


rule anvi_script_reformat_fasta_prefix_only:
    '''
        Reformatting the headers of the contigs fasta files.

        This is required to make sure that the headers don't contain
        any characters that anvi'o doesn't like. It gives contigs
        meaningful names; so that if the group name is 'MYSAMPLE01', the
        contigs would look like this:
        > MYSAMPLE01_000000000001
        > MYSAMPLE01_000000000002

        We do it in two steps in this workflow with the rules:
        anvi_script_reformat_fasta_prefix_only and anvi_script_reformat_fasta
        The first rule anvi_script_reformat_fasta_prefix_only does all the formatting
        of headers without considering the min_len parameter. The second rule takes
        the output of the first rule as input and considers the min_len param.
        The reasoning is that this way we have a fasta file that includes all the contigs
        from the raw fasta file, but with headers that match the final fasta file.
        Hence, if later we decide we want to include shorter contigs, we can do it using
        the output of anvi_script_reformat_fasta_prefix_only and the headers would be consistent.
    '''
    version: 1.0
    log: dirs_dict["LOGS_DIR"] + "/{group}-anvi_script_reformat_fasta_prefix_only.log"
    input:
        # the raw fasta of the group, as resolved by get_raw_fasta
        contigs = get_raw_fasta
    output:
        # write protecting the contigs fasta file using protected() because
        # running the assembly is probably the most time consuming step and
        # we don't want anyone accidentally deleting or changing this file.
        contigs = protected(dirs_dict["FASTA_DIR"] + "/{group}/{group}-contigs-prefix-formatted-only.fa"),
        report = dirs_dict["FASTA_DIR"] + "/{group}/{group}-reformat-report.txt"
    params:
        # the group name is used as the prefix of the contig names
        prefix = "{group}",
        # these params are read from the config entry of the rule
        # anvi_script_reformat_fasta (not of this rule)
        simplify_names = M.get_rule_param("anvi_script_reformat_fasta", "--simplify-names"),
        keep_ids = M.get_rule_param("anvi_script_reformat_fasta", "--keep-ids"),
        exclude_ids = M.get_rule_param("anvi_script_reformat_fasta", "--exclude-ids"),
    # threads and resources also reuse the settings of anvi_script_reformat_fasta
    threads: M.T('anvi_script_reformat_fasta')
    resources: nodes = M.T('anvi_script_reformat_fasta'),
    shell: w.r("""anvi-script-reformat-fasta {input} \
                                             -o {output.contigs} \
                                             -r {output.report} \
                                             --prefix {params.prefix} \
                                             {params.exclude_ids} \
                                             {params.keep_ids} \
                                             {params.simplify_names} >> {log} 2>&1""")


rule anvi_script_reformat_fasta:
    '''
        See documentation for anvi_script_reformat_fasta_prefix_only
    '''
    version: 1.0
    log: dirs_dict["LOGS_DIR"] + "/{group}-anvi_script_reformat_fasta.log"
    input:
        # output of the rule anvi_script_reformat_fasta_prefix_only
        contigs = dirs_dict["FASTA_DIR"] + "/{group}/{group}-contigs-prefix-formatted-only.fa"
    output:
        contigs = dirs_dict["FASTA_DIR"] + "/{group}/{group}-contigs.fa"
    params:
        # NOTE: `prefix` is declared here but the shell command below does
        # not reference it; only `min_len` is passed to the program.
        prefix = "{group}",
        min_len = M.get_rule_param("anvi_script_reformat_fasta", "--min-len"),
    threads: M.T('anvi_script_reformat_fasta')
    resources: nodes = M.T('anvi_script_reformat_fasta'),
    shell: w.r("""anvi-script-reformat-fasta {input} \
                                             -o {output.contigs} \
                                             {params.min_len} >> {log} 2>&1""")


def input_for_run_remove_human_dna_using_centrifuge(wildcards):
    ''' Return the fasta path that feeds the rule remove_human_dna_using_centrifuge.

        If the config value ['anvi_script_reformat_fasta', 'run'] is truthy
        the reformatted contigs fasta of the group is returned, otherwise
        the raw fasta is returned.
    '''
    raw_fasta = get_raw_fasta(wildcards)
    reformat_is_on = M.get_param_value_from_config(['anvi_script_reformat_fasta','run'])

    if not reformat_is_on:
        # no formatting is needed, so the raw fasta is used as is
        return raw_fasta

    # by default, reformat fasta is ran, and its output is used
    reformatted_suffix = "/{group}/{group}-contigs.fa".format(group=wildcards.group)
    return dirs_dict["FASTA_DIR"] + reformatted_suffix


# The rule below only exists when run_remove_human_dna_using_centrifuge is True.
if run_remove_human_dna_using_centrifuge:

    # These rules will only run if the user asked for removal of Human contamination
    rule remove_human_dna_using_centrifuge:
        """ this is just a placeholder for now """
        version: 1.0
        log: dirs_dict["LOGS_DIR"] + "/{group}-remove-human-dna-using-centrifuge.log"
        # reformatted fasta or raw fasta, depending on the config
        input: input_for_run_remove_human_dna_using_centrifuge
        output: contigs = dirs_dict["FASTA_DIR"] + "/{group}/{group}-contigs-filtered.fa"
        threads: M.T('remove_human_dna_using_centrifuge')
        resources: nodes = M.T('remove_human_dna_using_centrifuge'),
        # NOTE: the command only touches the output file; the input is not
        # read and no filtering takes place.
        shell: "touch {output} >> {log} 2>&1"


def get_fasta(wildcards):
    '''
        Pick the fasta file that is used to generate the contigs database.

        Priority, highest first:
            1. output of remove_human_dna_using_centrifuge (only when
               host contamination removal was requested)
            2. whatever input_for_run_remove_human_dna_using_centrifuge
               returns, i.e. the output of anvi_script_reformat_fasta if
               it is ran, or else the raw fasta
    '''
    # this is the file that the contamination removal rule itself would
    # have used as input
    unfiltered = input_for_run_remove_human_dna_using_centrifuge(wildcards)

    if not run_remove_human_dna_using_centrifuge:
        return unfiltered

    # host contamination removal is used, so its output takes over
    return rules.remove_human_dna_using_centrifuge.output.contigs

# Warn when the config sets a project name for the contigs databases that
# is neither missing nor equal to the default of the workflow.
contigs_project_name = M.get_param_value_from_config(['anvi_gen_contigs_database', '--project-name'], repress_default=True)
default_contigs_project_name = M.default_config['anvi_gen_contigs_database']['--project-name']
if not (contigs_project_name == default_contigs_project_name or contigs_project_name is None):
    run.warning('You chose to set the "project_name" for your contigs databases\
                 in the config file to %s. You are welcomed to do that, but at your own\
                 risk. Just so you know, by default the project name would match\
                 the name for each contigs file (as defined either in the samples_txt\
                 or fasta_txt file that you supplied), by choosing to provide\
                 a different name, it means that all your contigs databases would have\
                 the same name, unless you incloded "{group}" in the name you provided\
                 but even then, we did not test that option and we are not sure it would\
                 work...' % contigs_project_name)

rule anvi_gen_contigs_database:
    """ Generates a contigs database using anvi-gen-contigs-database"""
    # Setting the version to the same as that of the contigs__version in anvi'o
    version: anvio.__contigs__version__
    log: dirs_dict["LOGS_DIR"] + "/{group}-anvi_gen_contigs_database.log"
    # depending on whether human contamination using centrifuge was done
    # or not, the input to this rule will be the raw assembly or the
    # filtered.
    input: get_fasta
    output:
        db = dirs_dict["CONTIGS_DIR"] + "/{group}-contigs.db"
    params:
        # optional anvi-gen-contigs-database params, taken from the config
        # entry of this rule through M.get_rule_param.
        description = M.get_rule_param("anvi_gen_contigs_database", "--description"),
        skip_gene_calling = M.get_rule_param("anvi_gen_contigs_database", "--skip-gene-calling"),
        external_gene_calls = M.get_rule_param("anvi_gen_contigs_database", "--external-gene-calls"),
        ignore_internal_stop_codons = M.get_rule_param("anvi_gen_contigs_database", "--ignore-internal-stop-codons"),
        skip_mindful_splitting = M.get_rule_param("anvi_gen_contigs_database", "--skip-mindful-splitting"),
        contigs_fasta = M.get_rule_param("anvi_gen_contigs_database", "--contigs-fasta"),
        project_name = M.get_rule_param("anvi_gen_contigs_database", "--project-name"),
        split_length = M.get_rule_param("anvi_gen_contigs_database", "--split-length"),
        kmer_size = M.get_rule_param("anvi_gen_contigs_database", "--kmer-size"),
    threads: M.T('anvi_gen_contigs_database')
    resources: nodes = M.T('anvi_gen_contigs_database'),
    # every param is passed to the program exactly once (the command used to
    # repeat skip_mindful_splitting, ignore_internal_stop_codons,
    # external_gene_calls, skip_gene_calling and description a second time).
    shell: w.r("anvi-gen-contigs-database -f {input} \
                                          -o {output.db} \
                                          {params.ignore_internal_stop_codons} \
                                          {params.external_gene_calls} \
                                          {params.skip_gene_calling} \
                                          {params.skip_mindful_splitting} \
                                          {params.kmer_size} \
                                          {params.split_length} \
                                          {params.description} \
                                          {params.project_name} \
                                          {params.contigs_fasta} >> {log} 2>&1")

# The three rules below form a chain (export gene calls -> centrifuge ->
# import taxonomy) and are only defined when run_taxonomy_with_centrifuge
# is True.
if run_taxonomy_with_centrifuge:
    # If the user wants taxonomy to be assigned with centrifuge
    # then these following rules would run.
    rule export_gene_calls_for_centrifuge:
        ''' Export gene calls and use for centrifuge'''
        version: 1.0
        log: dirs_dict["LOGS_DIR"] + "/{group}-export_gene_calls_for_centrifuge.log"
        # marking the input as ancient in order to ignore timestamps.
        input: ancient(dirs_dict["CONTIGS_DIR"] + "/{group}-contigs.db")
        # output is temporary. No need to keep this file.
        output: temp(dirs_dict["CONTIGS_DIR"] + "/{group}-gene-calls.fa")
        threads: M.T('export_gene_calls_for_centrifuge')
        resources: nodes = M.T('export_gene_calls_for_centrifuge'),
        shell: "anvi-get-sequences-for-gene-calls -c {input} -o {output} >> {log} 2>&1"


    rule centrifuge:
        ''' Run centrifuge on the exported gene calls of the contigs.db'''
        version: 1.0
        log: dirs_dict["LOGS_DIR"] + "/{group}-centrifuge.log"
        # the temporary gene calls fasta exported by the previous rule
        input: rules.export_gene_calls_for_centrifuge.output
        output:
            hits = dirs_dict["CONTIGS_DIR"] + "/{group}-centrifuge_hits.tsv",
            report = dirs_dict["CONTIGS_DIR"] + "/{group}-centrifuge_report.tsv"
        # the db path is read straight from `config`; the sanity check near
        # the top of this file already made sure that a value is set.
        params: db=config["centrifuge"]['db']
        threads: M.T('centrifuge')
        resources: nodes = M.T('centrifuge'),
        shell: w.r("centrifuge -f \
                               -x {params.db} \
                               {input} \
                               -S {output.hits} \
                               --report-file {output.report} \
                               --threads {threads} >> {log} 2>&1")


    rule anvi_import_taxonomy:
        ''' Run anvi-import-taxonomy-for-genes'''
        version: 1.0
        log: dirs_dict["LOGS_DIR"] + "/{group}-anvi_import_taxonomy.log"
        input:
            hits = rules.centrifuge.output.hits,
            report = rules.centrifuge.output.report,
            # marking the contigs.db as ancient in order to ignore timestamps.
            contigs = ancient(dirs_dict["CONTIGS_DIR"] + "/{group}-contigs.db")
        # using a flag file because no file is created by this rule.
        # for more information see:
        # http://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#flag-files
        output: touch(dirs_dict["CONTIGS_DIR"] + "/{group}-anvi_anvi_import_taxonomy.done")
        params: parser = "centrifuge"
        threads: M.T('anvi_import_taxonomy')
        resources: nodes = M.T('anvi_import_taxonomy'),
        # -i is given the report file first and the hits file second
        shell: w.r("anvi-import-taxonomy-for-genes -c {input.contigs} \
                                                   -i {input.report} {input.hits} \
                                                   -p {params.parser} >> {log} 2>&1")


# The rule below only exists when run_anvi_run_hmms is True.
if run_anvi_run_hmms:
    rule anvi_run_hmms:
        """ Run anvi-run-hmms"""
        version: 1.0
        log: dirs_dict["LOGS_DIR"] + "/{group}-anvi_run_hmms.log"
        # marking the input as ancient in order to ignore timestamps.
        input: ancient(dirs_dict["CONTIGS_DIR"] + "/{group}-contigs.db")
        # using a snakemake flag file as an output since no file is generated
        # by the rule.
        output: touch(dirs_dict["CONTIGS_DIR"] + "/anvi_run_hmms-{group}.done")
        params:
            # optional anvi-run-hmms params taken from the config entry of this rule
            installed_hmm_profile = M.get_rule_param("anvi_run_hmms", "--installed-hmm-profile"),
            hmm_profile_dir = M.get_rule_param("anvi_run_hmms", "--hmm-profile-dir"),
        threads: M.T('anvi_run_hmms')
        resources: nodes = M.T('anvi_run_hmms'),
        shell: w.r("anvi-run-hmms -c {input} \
                                  -T {threads} \
                                  {params.hmm_profile_dir} \
                                  {params.installed_hmm_profile} >> {log} 2>&1")


# NOTE(review): presumably this warns about a '--temporary-dir-path' value
# in the config with respect to the '{group}' wildcard -- confirm against
# anvio.workflows.warning_for_param.
w.warning_for_param(config, 'anvi_run_ncbi_cogs', '--temporary-dir-path', '{group}')
# Unlike anvi_run_hmms, this rule is always defined; it is only requested by
# annotate_contigs_database when run_anvi_run_ncbi_cogs is True.
rule anvi_run_ncbi_cogs:
    version: anvio.__contigs__version__
    log: dirs_dict["LOGS_DIR"] + "/{group}-anvi_run_ncbi_cogs.log"
    # marking the input as ancient in order to ignore timestamps.
    input: ancient(dirs_dict["CONTIGS_DIR"] + "/{group}-contigs.db")
    # using a snakemake flag file as the output of the rule
    output: touch(dirs_dict["CONTIGS_DIR"] + "/anvi_run_ncbi_cogs-{group}.done")
    params:
        # anvi-run-ncbi-cogs params. See anvi-run-ncbi-cogs help menu for more info.
        cog_data_dir = M.get_rule_param('anvi_run_ncbi_cogs', '--cog-data-dir'),
        sensitive = M.get_rule_param('anvi_run_ncbi_cogs', '--sensitive'),
        temporary_dir_path = M.get_rule_param('anvi_run_ncbi_cogs', '--temporary-dir-path'),
        search_with = M.get_rule_param('anvi_run_ncbi_cogs', '--search-with')
    threads: M.T('anvi_run_ncbi_cogs')
    resources: nodes = M.T('anvi_run_ncbi_cogs'),
    shell: w.r("""anvi-run-ncbi-cogs -c {input} \
                                     -T {threads} \
                                     {params.cog_data_dir} \
                                     {params.sensitive} \
                                     {params.temporary_dir_path} \
                                     {params.search_with} >> {log} 2>&1""")


# Exports the amino acid sequences of the gene calls of a contigs database;
# the output is the input of the rule emapper.
rule anvi_get_sequences_for_gene_calls:
    version: 1.0
    log: dirs_dict["LOGS_DIR"] + "/{group}-anvi_get_sequences_for_gene_calls.log"
    # marking the input as ancient in order to ignore timestamps.
    input: ancient(dirs_dict["CONTIGS_DIR"] + "/{group}-contigs.db")
    # output is temporary.
    # NOTE(review): the "TM7_NR" part of the file name is hard-coded and
    # looks like a leftover from a specific project -- confirm whether it
    # should be dropped.
    output: temp(dirs_dict["CONTIGS_DIR"] + "/{group}-TM7_NR-contigs-aa-sequences.fa")
    threads: M.T('anvi_get_sequences_for_gene_calls')
    resources: nodes = M.T('anvi_get_sequences_for_gene_calls')
    shell: "anvi-get-sequences-for-gene-calls -c {input} -o {output} --get-aa-sequences >> {log} 2>&1"


# Runs emapper.py on the exported amino acid sequences and then rewrites the
# annotations file so that gene ids carry a 'g' prefix.
rule emapper:
    version: 1.0
    log: dirs_dict["LOGS_DIR"] + "/{group}-emapper.log"
    input: rules.anvi_get_sequences_for_gene_calls.output
    # output is temporary. The file name is the value given to --output
    # (contigs_path_without_extension) followed by ".emapper.annotations".
    output: temp(dirs_dict["CONTIGS_DIR"] + "/{group}-contigs.emapper.annotations")
    # TODO: add other emapper params
    params:
        contigs_path_without_extension = dirs_dict["CONTIGS_DIR"] + "/{group}-contigs",
        # directory in which emapper.py is looked for
        path_to_emapper_dir = M.get_param_value_from_config(['emapper', 'path_to_emapper_dir']),
        database = M.get_rule_param("emapper", "--database"),
        usemem = M.get_rule_param("emapper", "--usemem"),
        override = M.get_rule_param("emapper", "--override")
    threads: M.T('emapper')
    resources: nodes = M.T('emapper')
    run:
        # running emapper (note that it is launched with python2.7); only
        # this first command writes to the log.
        shell("python2.7 {params.path_to_emapper_dir}/emapper.py -i {input} --output {params.contigs_path_without_extension} " + \
                  "--cpu {threads} {params.database} {params.usemem} {params.override} >> {log} 2>&1")

        # Adding a 'g' prefix before every gene id (the anvi'o emapper driver requires this)
        # i.e. every line that starts with a digit gets a 'g' in front of it.
        shell("sed 's/^[0-9]/g&/' {output} > {output}.temp")

        # changing the file name to the final output
        shell("mv {output}.temp {output}")


# Runs anvi-script-run-eggnog-mapper with the annotations file produced by
# the rule emapper and the contigs database of the group.
rule anvi_script_run_eggnog_mapper:
    version: 1.0
    log: dirs_dict["LOGS_DIR"] + "/{group}-anvi_script_run_eggnog_mapper.log"
    input:
        eggnog_output = rules.emapper.output,
        # marking the contigs.db as ancient in order to ignore timestamps,
        # same as every other annotation rule in this file. Without it, any
        # step that touches the contigs.db after this rule finished would
        # make this rule (and the emapper run that feeds it) look outdated.
        contigs = ancient(dirs_dict["CONTIGS_DIR"] + "/{group}-contigs.db")
    # using a snakemake flag file as an output since the results are not
    # written to a file of their own.
    output: touch(dirs_dict["CONTIGS_DIR"] + "/{group}-anvi_script_run_eggnog_mapper.done")
    params:
        use_version = M.get_rule_param("anvi_script_run_eggnog_mapper", "--use-version")
    threads: M.T('anvi_script_run_eggnog_mapper')
    resources: nodes = M.T('anvi_script_run_eggnog_mapper')
    shell: "anvi-script-run-eggnog-mapper -c {input.contigs} --annotation {input.eggnog_output} {params.use_version}  >> {log} 2>&1"


rule annotate_contigs_database:
    '''
        This is a dummy rule and it is here just to guarantee that all
        the contigs annotations will run (according to what was requested
        in the config file). The main use is to use --until annotate_contigs_database
        if you just want contigs databases with all the annotations.
    '''
    version: 1.0
    log: dirs_dict["LOGS_DIR"] + "/{group}-annotate_contigs_database.log"
    # Each input below is the flag file of an annotation step when that step
    # is enabled; when it is disabled the input falls back to the contigs.db
    # itself (marked as ancient), so the step is simply not requested.
    input:
        # this is here just so snakemake would run the taxonomy before running this rule
        taxonomy = rules.anvi_import_taxonomy.output if run_taxonomy_with_centrifuge else ancient(dirs_dict["CONTIGS_DIR"] + "/{group}-contigs.db"),
        # this is here just so snakemake would run the hmms before running this rule
        hmms = rules.anvi_run_hmms.output if run_anvi_run_hmms else ancient(dirs_dict["CONTIGS_DIR"] + "/{group}-contigs.db"),
        # this is here just so snakemake would run the ncbi cogs before running this rule
        cogs = rules.anvi_run_ncbi_cogs.output if run_anvi_run_ncbi_cogs else ancient(dirs_dict["CONTIGS_DIR"] + "/{group}-contigs.db"),
        # I hope that by now, you understand why this is here (hint: it is in order to prevent global warming)
        eggnog_mapper = rules.anvi_script_run_eggnog_mapper.output if run_anvi_script_run_eggnog_mapper else ancient(dirs_dict["CONTIGS_DIR"] + "/{group}-contigs.db")
    # the flag file that the target rule generate_and_annotate_contigs_db asks for
    output: touch(dirs_dict['CONTIGS_DIR'] + "/{group}-annotate_contigs_database.done")
    shell: "touch {output} >> {log} 2>&1"


# Only done when this file is the main workflow (not in slave mode).
if not slave_mode:
    # check if all program dependencies are met. for this line to be effective,
    # there should be an initial dry run step (which is the default behavior of
    # the `WorkflowSuperClass`, so you are most likely covered).
    M.check_workflow_program_dependencies(workflow)
