#!/usr/bin/env python
# -*- coding: utf-8
"""A script to export a FASTA files from contigs databases and compute ani."""

import os
import sys
import shutil
import argparse

import anvio
import anvio.db as db
import anvio.utils as utils
import anvio.terminal as terminal
import anvio.clustering as clustering
import anvio.filesnpaths as filesnpaths
import anvio.genomedescriptions as genomedescriptions

from anvio.drivers import pyani
from anvio.tables.miscdata import TableForLayerAdditionalData
from anvio.tables.miscdata import TableForLayerOrders

from anvio.errors import ConfigError, FilesNPathsError
import anvio.errors


__author__ = "Developers of anvi'o (see AUTHORS.txt)"
__copyright__ = "Copyleft 2015-2018, the Meren Lab (http://merenlab.org/)"
__credits__ = []
__license__ = "GPL 3.0"
__version__ = anvio.__version__
__maintainer__ = "Ozcan Esen"
__email__ = "ozcanesen@gmail.com"

run = terminal.Run()

def restore_names_in_dict(input_dict, conversion_dict):
    """
    Takes dictionary that contains hashes as keys
    and replaces it back to genome names using conversion_dict.

    If value is dict, it calls itself.
    """
    new_dict = {}
    for key, value in input_dict.items():
        if isinstance(value, dict):
            value = restore_names_in_dict(value, conversion_dict)

        if key in conversion_dict:
            new_dict[conversion_dict[key]] = value
        else:
            new_dict[key] = value

    return new_dict


def main(args):
    program = pyani.PyANI(args)

    filesnpaths.check_output_directory(args.output_dir, ok_if_exists=False)

    genome_desc = genomedescriptions.GenomeDescriptions(args, run = terminal.Run(verbose=False))
    genome_desc.load_genomes_descriptions(skip_functions=True, init=False)

    temp_dir = filesnpaths.get_temp_directory_path()

    run.info("Genomes found", len(genome_desc.genomes))
    run.info("Temporary FASTA output directory", temp_dir)
    run.info("Output directory", os.path.abspath(args.output_dir))

    hash_to_name = {}
    genome_names = set([])
    for genome_name in genome_desc.genomes:
        genome_names.add(genome_name)
        db_path = genome_desc.genomes[genome_name]['contigs_db_path']
        db_hash = genome_desc.genomes[genome_name]['contigs_db_hash']

        hash_to_name[db_hash] = genome_name
        utils.export_sequences_from_contigs_db(db_path, os.path.join(temp_dir, db_hash + '.fa'))

    results = program.run_command(temp_dir)
    results = restore_names_in_dict(results, hash_to_name)

    clusterings = {}
    for report_name in results:
        clusterings[report_name] = clustering.get_newick_tree_data_for_dict(results[report_name],
            linkage=args.linkage, distance=args.distance)

    os.mkdir(args.output_dir)
    for report_name in results:
        output_path_for_report = os.path.join(args.output_dir, args.method + '_' + report_name)

        utils.store_dict_as_TAB_delimited_file(results[report_name], output_path_for_report + '.txt')
        with open(output_path_for_report + '.newick', 'w') as f:
            f.write(clusterings[report_name])

    if args.pan_db:
        utils.is_pan_db(args.pan_db)
        pan = db.DB(args.pan_db, anvio.__pan__version__)

        db_genome_names = set(pan.get_meta_value('external_genome_names').split(',') +
            pan.get_meta_value('internal_genome_names').split(','))

        found_only_in_db = db_genome_names.difference(genome_names)
        found_only_in_results = genome_names.difference(db_genome_names)

        if len(found_only_in_results) > 0:
            raise ConfigError("Some genome names found in ANI result, does not seem to be exists in the pan database. \
                Here are the list of them: " + ", ".join(list(found_only_in_results)))

        if len(found_only_in_db) > 0:
            run.warning("Some genome names found in pan database, does not seem to be exists in the ANI report. \
                anvi'o will add the ones that found in database anyway, but here is the list of missing ones: \
                " + ", ".join(list(found_only_in_db)))

        for report_name in results:
            target_data_group = 'ANI_%s' % (report_name)
            l_args = argparse.Namespace(pan_db=args.pan_db, just_do_it=True, target_data_group=target_data_group)
            TableForLayerAdditionalData(l_args).add(results[report_name], list(results[report_name].keys()))

            TableForLayerOrders(args).add({'ANI_' + report_name: {'data_type': 'newick',
                                                                  'data_value': clusterings[report_name]}})


    if anvio.DEBUG:
        run.warning("The temp directory, %s, is kept. Please don't forget to clean it up\
                     later" % temp_dir, header="Debug")
    else:
        run.info_single('Cleaning up the temp directory (you can use `--debug` if you would\
                         like to keep it for testing purposes)', nl_before=1, nl_after=1)
        shutil.rmtree(temp_dir)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Export sequences from external genomes and compute\
                                     ANI. If Pan Database is given anvi'o will write computed output\
                                     to misc data tables of Pan Database.")

    parser.add_argument(*anvio.A('internal-genomes'), **anvio.K('internal-genomes'))
    parser.add_argument(*anvio.A('external-genomes'), **anvio.K('external-genomes'))
    parser.add_argument(*anvio.A('output-dir'), **anvio.K('output-dir', {'required': True }))
    parser.add_argument(*anvio.A('pan-db'), **anvio.K('pan-db', {'required': False}))
    parser.add_argument(*anvio.A('num-threads'), **anvio.K('num-threads'))
    parser.add_argument(*anvio.A('log-file'), **anvio.K('log-file'))
    parser.add_argument('--method', default='ANIb', type=str, help="Method for pyANI. The default is %(default)s.\
                         You must have the necessary binary in path for whichever method you choose. According to\
                         the pyANI help for v0.2.7 at https://github.com/widdowquinn/pyani, the method 'ANIm' uses\
                         MUMmer (NUCmer) to align the input sequences. 'ANIb' uses BLASTN+ to align 1020nt fragments\
                         of the input sequences. 'ANIblastall': uses the legacy BLASTN to align 1020nt fragments\
                         Finally, 'TETRA': calculates tetranucleotide frequencies of each input sequence",\
                         choices=['ANIm', 'ANIb', 'ANIblastall', 'TETRA'])
    parser.add_argument(*anvio.A('distance'), **anvio.K('distance', {'help': 'The distance metric for the hierarchical \
                         clustering. The default is "%(default)s".'}))
    parser.add_argument(*anvio.A('linkage'), **anvio.K('linkage', {'help': 'The linkage method for the hierarchical \
                         clustering. The default is "%(default)s".'}))
    parser.add_argument(*anvio.A('just-do-it'), **anvio.K('just-do-it'))

    args = anvio.get_args(parser)

    try:
        main(args)
    except ConfigError as e:
        print(e)
        sys.exit(-1)
    except FilesNPathsError as e:
        print(e)
        sys.exit(-1)
