#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import re
import sys
import argparse
from ete3 import Tree

import anvio
import anvio.utils as utils
import anvio.tables as tables
import anvio.terminal as terminal
import anvio.filesnpaths as filesnpaths
from anvio.errors import ConfigError, FilesNPathsError

__author__ = "Developers of anvi'o (see AUTHORS.txt)"
__copyright__ = "Copyleft 2015-2018, the Meren Lab (http://merenlab.org/)"
__credits__ = []
__license__ = "GPL 3.0"
__version__ = anvio.__version__
__maintainer__ = "Özcan Esen"
__email__ = "ozcanesen@gmail.com"
__resources__ = [("The context in which the script was conceived", "https://anvio.slack.com/archives/C8SFMGYF3/p1518192776000335")]


# Terminal helpers shared by the rest of the script for reporting.
run = terminal.Run()
progress = terminal.Progress()

# Lookup built from every column name of the taxon names table except the
# first one: the key is the third character of the column name and the value
# is the column name from its third character on (presumably columns look
# like `t_order`, giving 'o' -> 'order' -- confirm against anvio.tables).
taxon_letter_to_name = {column[2]: column[2:] for column in tables.taxon_names_table_structure[1:]}


def process(input_fasta_file, output_dir):
    """Turn a tree with `UID|taxonomy|value` node labels into a plain newick
    tree and a TAB-delimited taxonomy table for its leaves.

    Every label matching `UID|taxonomy|value` (optionally wrapped in single
    quotes) is replaced by its UID in the newick text. The `;`-separated
    taxonomy entries carried by a label are assigned to every leaf found under
    the node with that UID; since nodes are visited in level order, entries
    from nodes visited later overwrite those set by nodes visited earlier.

    Parameters
    ==========
    input_fasta_file : str
        Path to the tree file. Despite the parameter name, its content is read
        as newick text.
    output_dir : str
        Directory that is created with `os.mkdir` and receives the output
        files `newick.tree` and `view_data.txt`.
    """

    filesnpaths.is_file_exists(input_fasta_file)
    filesnpaths.check_output_directory(output_dir)

    with open(input_fasta_file, 'r') as tree_file:
        newick = tree_file.read()

    run.info('Input tree found', input_fasta_file)

    label_pattern = re.compile(r"'?(\w+)\|(.*?)\|([\.eE\-0-9]+)?'?")

    # collect, per UID, the taxon names declared in its label
    uid_to_taxonomy = {}
    for label in label_pattern.finditer(newick):
        uid = label.group(1)

        for entry in label.group(2).split(';'):
            if not entry:
                continue

            # an entry looks like `o__Planctomycetales`: the first character
            # identifies the taxonomic level, the name starts at index 3
            letter = entry[0]
            if letter not in taxon_letter_to_name:
                continue

            uid_to_taxonomy.setdefault(uid, {})[taxon_letter_to_name[letter]] = entry[3:]

    # keep only the UID part of each matching label in the tree text
    newick = label_pattern.sub(r'\1', newick)
    tree = Tree(newick, format=1)

    taxon_names = list(taxon_letter_to_name.values())

    # every leaf starts with an empty value for each taxonomic level
    output_dict = {leaf.name: dict.fromkeys(taxon_names, '') for leaf in tree.get_leaves()}

    # push the taxonomy of each annotated node down to all leaves below it
    for node in tree.traverse("levelorder"):
        node_taxonomy = uid_to_taxonomy.get(node.name)
        if node_taxonomy is None:
            continue

        for leaf in node.get_leaves():
            output_dict[leaf.name].update(node_taxonomy)

    os.mkdir(output_dir)
    output_newick_path = os.path.join(output_dir, 'newick.tree')
    output_view_data_path = os.path.join(output_dir, 'view_data.txt')

    run.info('Output tree path', output_newick_path)
    run.info('Output view data path', output_view_data_path)

    utils.store_dict_as_TAB_delimited_file(output_dict,
                                           output_view_data_path,
                                           headers=['items'] + taxon_names)

    with open(output_newick_path, 'w') as newick_file:
        newick_file.write(newick)

    run.info_single("Successfully converted.", nl_before=1, nl_after=1)


if __name__ == '__main__':
    # Command-line entry point: parse the tree path and the output directory,
    # then run the conversion. The description below states what this script
    # does; the previous text was for an unrelated FASTA-reformatting program.
    parser = argparse.ArgumentParser(description="Convert a tree file generated by CheckM into a newick tree with "
                                                 "simplified node names and a TAB-delimited view data file that "
                                                 "describes the taxonomy of each leaf in the tree.")
    parser.add_argument('-t', '--tree', required=True, metavar='CHECKM TREE',
                            help="Tree file generated by CheckM.")
    parser.add_argument('-o', '--output-dir', required=True, metavar='DIRECTORY', 
                            help="The directory name that output files will be stored.")

    args = anvio.get_args(parser)

    # Known anvi'o errors are printed and mapped to distinct exit codes;
    # anything else propagates with its traceback.
    try:
        process(args.tree, args.output_dir)
    except ConfigError as e:
        print(e)
        sys.exit(-1)
    except FilesNPathsError as e:
        print(e)
        sys.exit(-2)
