# -*- coding: utf-8

import os
import anvio
import argparse
import pandas as pd
import anvio.workflows as w
import anvio.terminal as terminal
import anvio.filesnpaths as filesnpaths

from anvio.errors import ConfigError
from anvio.workflows.pangenomics import PangenomicsWorkflow

__author__ = "Alon Shaiber"
__copyright__ = "Copyright 2017, The anvio Project"
__credits__ = []
__license__ = "GPL 3.0"
__version__ = anvio.__version__
__maintainer__ = "Alon Shaiber"
__email__ = "alon.shaiber@gmail.com"

run = terminal.Run()

# this Snakefile runs on its own only when the text 'workflows/pangenomics' occurs in
# `workflow.included[0]` (presumably the first Snakefile snakemake loaded -- confirm);
# otherwise some other workflow has included it and we are in "slave mode"
slave_mode = 'workflows/pangenomics' not in workflow.included[0]

if not slave_mode:
    # stand-alone run: build the workflow object from the config ourselves.
    # in slave mode `M` and `dirs_dict` are expected to exist already
    # (presumably set up by the including Snakefile)
    M = PangenomicsWorkflow(argparse.Namespace(config=config))
    M.init()
    dirs_dict = M.dirs_dict

# the contigs workflow Snakefile supplies the rules that create the contigs databases.
# keep this include below the block above: it has to come after the
# PangenomicsWorkflow instance was created
include: w.get_workflow_snake_file_path('contigs')

# a general project name is mandatory (it is used to name the genomes storage file)
project_name = M.get_param_value_from_config("project_name")
if not project_name:
    raise ConfigError("You must provide a project name in your config file.")

# "anvi_pan_genome" may carry its own "--project-name"; when it does not,
# the general project name is reused for the pan database as well
pan_project_name = M.get_param_value_from_config(["anvi_pan_genome", "--project-name"])
if not pan_project_name:
    pan_project_name = project_name
else:
    run.warning('you chose to set the "--project-name" parameter for "anvi_pan_genome". That is ok\
                 but just so you know, if you haven\'t supplied this, then we would have taken the value\
                 from "project_name" in your config file to also be the project name for "anvi_pan_genome"')

# fasta.txt (that is also its default name) is a TAB-delimited table; its first
# column is used as the index and provides the sample names
fasta_txt = M.get_param_value_from_config("fasta_txt")
filesnpaths.is_file_exists(fasta_txt)
fasta_information = pd.read_csv(fasta_txt, sep='\t', index_col=0).to_dict(orient='index')
ALL_SAMPLES = list(fasta_information)

# paths of the genomes files that are handed to "anvi_gen_genomes_storage"
internal_genomes_file = M.get_param_value_from_config(["anvi_gen_genomes_storage", "--internal-genomes"])
external_genomes_file = M.get_param_value_from_config(["anvi_gen_genomes_storage", "--external-genomes"])

# at least one of the two genomes files is required
if not (internal_genomes_file or external_genomes_file):
    raise ConfigError("You must provide a path to either internal_genomes_file or external_genomes_file\
                       or both.")

# run pangenome
# Calls `anvi-pan-genome` on the genomes storage database and declares the pan
# database (<PAN_DIR>/<pan_project_name>-PAN.db) as its output.
# NOTE(review): a log file is declared below, but the shell command never refers to
# {log}, so nothing in this rule writes to it -- confirm whether that is intended.
rule anvi_pan_genome:
    version: anvio.__pan__version__
    log: dirs_dict["LOGS_DIR"] + "/anvi_pan_genome.log"
    # the same value is used as the thread count and as the "nodes" resource
    threads: M.T("anvi_pan_genome")
    resources: nodes = M.T("anvi_pan_genome")
    # the genomes storage is named after the general project name ...
    input: dirs_dict["PAN_DIR"] + "/" + project_name + "-GENOMES.db"
    # every optional parameter is placed into the command line as-is; presumably
    # M.get_rule_param returns the complete "flag value" text, or an empty string
    # when the parameter is not set in the config file -- confirm in the workflow class
    params:
        output_dir = dirs_dict["PAN_DIR"],
        genome_names = M.get_rule_param("anvi_pan_genome", "--genome-names"),
        project_name = pan_project_name,
        skip_alignments = M.get_rule_param("anvi_pan_genome", "--skip-alignments"),
        align_with = M.get_rule_param("anvi_pan_genome", "--align-with"),
        exclude_partial_gene_calls = M.get_rule_param("anvi_pan_genome", "--exclude-partial-gene-calls"),
        use_ncbi_blast = M.get_rule_param("anvi_pan_genome", "--use-ncbi-blast"),
        minbit = M.get_rule_param("anvi_pan_genome", "--minbit"),
        mcl_inflation = M.get_rule_param("anvi_pan_genome", "--mcl-inflation"),
        min_occurrence = M.get_rule_param("anvi_pan_genome", "--min-occurrence"),
        min_percent_identity = M.get_rule_param("anvi_pan_genome", "--min-percent-identity"),
        sensitive = M.get_rule_param("anvi_pan_genome", "--sensitive"),
        description = M.get_rule_param("anvi_pan_genome", "--description"),
        overwrite_output_destinations = M.get_rule_param("anvi_pan_genome", "--overwrite-output-destinations"),
        skip_hierarchical_clustering = M.get_rule_param("anvi_pan_genome", "--skip-hierarchical-clustering"),
        enforce_hierarchical_clustering = M.get_rule_param("anvi_pan_genome", "--enforce-hierarchical-clustering"),
        distance = M.get_rule_param("anvi_pan_genome", "--distance"),
        linkage = M.get_rule_param("anvi_pan_genome", "--linkage")
    # ... while the pan database is named after the project name given to anvi-pan-genome
    output: dirs_dict["PAN_DIR"] + "/" + pan_project_name + "-PAN.db"
    shell:
        """
            anvi-pan-genome -g {input} --num-threads {threads} -o {params.output_dir} --project-name {params.project_name} {params.genome_names}\
            {params.skip_alignments} {params.align_with} {params.exclude_partial_gene_calls}\
            {params.use_ncbi_blast} {params.minbit} {params.mcl_inflation}\
            {params.min_occurrence} {params.min_percent_identity} {params.sensitive}\
            {params.description} {params.overwrite_output_destinations}\
            {params.skip_hierarchical_clustering} {params.enforce_hierarchical_clustering}\
            {params.distance} {params.linkage}
        """



# generate the external genomes file: a TAB-delimited table with one row per sample
# in ALL_SAMPLES that maps a genome name to the path of its contigs database
rule gen_external_genome_file:
    version: 1.0
    log: dirs_dict["LOGS_DIR"] + "/gen_external_genome_file.log"
    input:
        # the ".done" files are listed so that this rule waits for them (presumably
        # they mark the end of the contigs database annotation -- confirm in the
        # contigs workflow); only the contigs database paths are written to the file
        annotation_done = expand(dirs_dict['CONTIGS_DIR'] + "/{sample}-annotate_contigs_database.done", sample=ALL_SAMPLES),
        contigs_dbs = expand(dirs_dict["CONTIGS_DIR"] + "/{sample}-contigs.db", sample=ALL_SAMPLES)
    output: external_genomes_file
    threads: M.T("gen_external_genome_file")
    resources: nodes = M.T("gen_external_genome_file")
    run:
        with open(output[0], 'w') as f:
            f.write("name\tcontigs_db_path\n")
            for c in input.contigs_dbs:
                # build the name from the file name alone: taking the base name first
                # keeps a "." anywhere in the directory part of the path (e.g. "./DIR"
                # or "run.v2/DIR") from truncating the name. the name is everything
                # before the first "." of the file name, with "-" replaced by "_"
                name = os.path.basename(c).split(".")[0].replace('-', '_')
                f.write("%s\t%s\n" % (name, c))

def input_for_anvi_gen_genomes_storage(wildcards):
    """Return the input files of the genomes storage rule as a dictionary.

    The dictionary always has the two keys "internal_genomes_file" and
    "external_genomes_file". The config file may provide only one of the two
    files (at least one of them is mandatory), so a file that was not provided
    is replaced with the other one. That way both entries point to a file that
    is really there and the rule has a valid input for each key.

    `wildcards` is not used; it is only here because this function is used
    as an input function of a rule.
    """
    return {
        "internal_genomes_file": internal_genomes_file or external_genomes_file,
        "external_genomes_file": external_genomes_file or internal_genomes_file,
    }


# generate anvi'o genomes storage
# Calls `anvi-gen-genomes-storage` with the internal and/or external genomes file and
# declares <PAN_DIR>/<project_name>-GENOMES.db (the input of anvi_pan_genome) as output.
rule anvi_gen_genomes_storage:
    log: dirs_dict["LOGS_DIR"] + "/anvi_gen_genomes_storage.log"
    # the same value is used as the thread count and as the "nodes" resource
    threads: M.T("anvi_gen_genomes_storage")
    resources: nodes = M.T("anvi_gen_genomes_storage")
    # the input function returns a dictionary; unpack() turns it into named inputs
    input: unpack(input_for_anvi_gen_genomes_storage)
    output: dirs_dict["PAN_DIR"] + "/" + project_name + "-GENOMES.db"
    params:
        # each argument is added to the command line only if the matching file was
        # set in the config file; otherwise it is an empty string
        internal_genomes_argument = lambda wildcards: "--internal-genomes " + internal_genomes_file if internal_genomes_file else "",
        external_genomes_argument = lambda wildcards: "--external-genomes " + external_genomes_file if external_genomes_file else "",
        gene_caller = M.get_rule_param("anvi_gen_genomes_storage", "--gene-caller"),
    shell:
        """
            anvi-gen-genomes-storage -o {output}\
                                     {params.internal_genomes_argument}\
                                     {params.external_genomes_argument}\
                                     {params.gene_caller}
        """

# only a stand-alone run of this workflow checks the program dependencies here
# (presumably the including workflow does its own check in slave mode -- confirm).
# NOTE(review): `slave_mode` was set before the `include:` of the contigs Snakefile.
# If that Snakefile assigns its own `slave_mode` in the same namespace, the value
# tested here is the overwritten one and this check may be skipped -- confirm
# against the contigs workflow.
if not slave_mode:
    M.check_workflow_program_dependencies(workflow)
