#!/usr/bin/env python
# -*- coding: utf-8
"""Searches contigs database for a given function"""

import sys

import anvio
import anvio.utils as utils
import anvio.terminal as terminal

from anvio.errors import ConfigError, FilesNPathsError
from anvio.dbops import ContigsSuperclass, PanSuperclass


__author__ = "Developers of anvi'o (see AUTHORS.txt)"
__copyright__ = "Copyleft 2015-2018, the Meren Lab (http://merenlab.org/)"
__credits__ = []
__license__ = "GPL 3.0"
__version__ = anvio.__version__
__maintainer__ = "A. Murat Eren"
__email__ = "a.murat.eren@gmail.com"
__requires__ = ['contigs-db', 'genomes-storage-db',]
__provides__ = ['functions-txt',]
__description__ = ("Search functions in an anvi'o contigs database or genomes storage. Basically, this program searches for one "
                   "or more search terms you define in functional annotations of genes in an anvi'o contigs database, and generates "
                   "multiple reports. The simpler report (which also is the default one) simply tells you which contigs contain genes "
                   "with functions matching to serach terms you used. This file is only useful to quickly highlight matching contigs "
                   "in the interface by providing it to the anvi-interactive with the `--additional-layer` parameter. You can also "
                   "request a much more comprehensive report, which gives you anything you might need to know, including the matching "
                   "gene caller id, functional annotation source, and full function name for each hit and serach term.")


run = terminal.Run()
progress = terminal.Progress()

class SearchResultReporter(object):
    def __init__(self, args):
        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.annotation_sources = A('annotation_sources')
        self.list_annotation_sources = A('list_annotation_sources')
        self.basic_report_path = A('output_file') or 'search_results.txt'
        self.full_report_path = A('full_report')
        self.include_sequences = A('include_sequences')
        self.verbose = A('verbose')
        self.args = args
        self.search_mode = None

        self.contigs_db = A('contigs_db')
        self.genomes_storage = A('genomes_storage')
        self.pan_db = A('pan_db')

        # get search results will fill these
        self.all_item_names = None
        self.matching_item_names_dict = None
        self.verbose_output = None

        # straighten things out
        self.search_terms = [s.strip() for s in A('search_terms').split(A('delimiter'))]
        self.annotation_sources = [a.strip() for a in A('annotation_sources').split(A('delimiter'))] if A('annotation_sources') else None

        self.db = self.get_database()

        if self.list_annotation_sources:
            self.db.list_function_sources()
            sys.exit()


    def process(self):
        self.get_search_results()
        self.write_report()

        if self.full_report_path:
            self.write_full_report()


    def get_database(self):
        if self.contigs_db and (self.genomes_storage or self.pan_db):
            raise ConfigError("You can not provide both contigs database and genomes storage")

        if self.contigs_db:
            self.search_mode = 'contigs'
            run.info("Searching in Contigs Database", self.contigs_db)
            return ContigsSuperclass(self.args)

        elif self.genomes_storage and self.pan_db:
            self.search_mode = 'gene_clusters'
            run.info("Searching in Genomes Storage", self.genomes_storage)
            pan_database = PanSuperclass(self.args)
            pan_database.init_gene_clusters()
            return pan_database

        else:
            raise ConfigError("You did not provide enough arguments to initialize contigs database or pan database. "
                              "To initialize pan database you need to provide both genome storage and pan database.")


    def get_search_results(self):
        self.matching_item_names_dict, self.verbose_output = self.db.search_for_gene_functions(self.search_terms, requested_sources=self.annotation_sources, verbose = self.verbose)

        self.all_item_names = set([])

        for item_names in list(self.matching_item_names_dict.values()):
            self.all_item_names.update(item_names)


    def write_report(self):
        results_dict = {}

        for item_name in self.all_item_names:
            results_dict[item_name] = dict([(s + '_hits', '') for s in self.search_terms])

            for search_term in self.search_terms:
                if item_name in self.matching_item_names_dict[search_term]:
                    results_dict[item_name][search_term + '_hits'] = search_term

        utils.store_dict_as_TAB_delimited_file(results_dict, self.basic_report_path, headers = [self.search_mode] + [s + '_hits' for s in self.search_terms])
        run.info('Items additional data compatible output', self.basic_report_path)


    def write_full_report(self):
        if self.search_mode == 'contigs':
            header = ['gene_callers_id']
        elif self.search_mode == 'gene_clusters':
            header = ['gene_callers_id', 'genome_name']
        else:
            raise ConfigError("You ended up in a place you should have never ended up. Go back. Go back.")

        header.extend(['source', 'accession', 'function', 'search_term', self.search_mode])

        if self.include_sequences:
            if self.search_mode == 'contigs':
                gene_caller_ids = list(set([e[0] for e in self.verbose_output]))
                _, gene_sequences_dict = self.db.get_sequences_for_gene_callers_ids(gene_caller_ids, include_aa_sequences=True)
                header.extend(['direction', 'rev_compd', 'dna_sequence', 'aa_sequence'])
            elif self.search_mode == 'gene_clusters':
                header.extend(['dna_sequence', 'aa_sequence'])

        report = open(self.full_report_path, 'w')
        report.write('\t'.join(header) + '\n')
        for entry in self.verbose_output:
            content = [str(item) for item in entry]
            if self.include_sequences:
                if self.search_mode == 'contigs':
                    g = gene_sequences_dict[entry[0]]
                    content.extend([g['direction'],
                                    g['rev_compd'],
                                    g['sequence'],
                                    g['aa_sequence'],])
                elif self.search_mode == 'gene_clusters':
                    # pan results already contains dna and aa sequences
                    pass
            else:
                if self.search_mode == 'gene_clusters':
                    content = content[:-2]

            report.write('\t'.join(content) + '\n')
        report.close()

        run.info('Full report', self.full_report_path)


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description=__description__)

    groupA = parser.add_argument_group('SEARCH IN', 'Relevant source databases')
    groupA.add_argument(*anvio.A('contigs-db'), **anvio.K('contigs-db', {'required': False}))
    groupA.add_argument(*anvio.A('pan-db'), **anvio.K('pan-db', {'required': False}))
    groupA.add_argument(*anvio.A('genomes-storage'), **anvio.K('genomes-storage', {'required': False}))

    groupB = parser.add_argument_group('SEARCH FOR', 'Relevant terms')
    groupB.add_argument(*anvio.A('search-terms'), **anvio.K('search-terms', {'required': True}))
    groupB.add_argument(*anvio.A('delimiter'), **anvio.K('delimiter'))
    groupB.add_argument(*anvio.A('annotation-sources'), **anvio.K('annotation-sources'))
    groupB.add_argument(*anvio.A('list-annotation-sources'), **anvio.K('list-annotation-sources'))

    groupC = parser.add_argument_group('REPORT', "Anvi'o can report the hits in multiple ways. The output file will be a very simple 2-column\
                                                  TAB-delimited output that is compatible with anvi'o additional data format (so you can give\
                                                  it to the `anvi-interactive` to see which splits contained genes that were matching to your\
                                                  search terms). You can also ask anvi'o to generate a full-report, that contains much more and\
                                                  much helpful information about each hit. Optionally you can even ask the gene sequences to\
                                                  appear in this report.")
    groupC.add_argument(*anvio.A('output-file'), **anvio.K('output-file'))
    groupC.add_argument(*anvio.A('full-report'), **anvio.K('full-report'))
    groupC.add_argument(*anvio.A('include-sequences'), **anvio.K('include-sequences'))
    groupC.add_argument(*anvio.A('verbose'), **anvio.K('verbose'))


    args = anvio.get_args(parser)

    try:
        SearchResultReporter(args).process()
    except ConfigError as e:
        print(e)
        sys.exit(-1)
    except FilesNPathsError as e:
        print(e)
        sys.exit(-2)
