#!/usr/bin/env python
# -*- coding: utf-8
"""Get sequences for all HMM hits in a given bin.

   This program takes a profile database, a collection ID, and a bin name, and an
   HMM source, and returnes sequences of HMM hits. This program is useful when you
   want to get actual sequencs for each single-copy gene hit in a particular genome
   bin.

  You want to play with it? This is how you could quickly test it:

  Downloaded the anvi'o data pack for the infant gut data, which is here:

    https://ndownloader.figshare.com/files/8252861

  Uunpack it and went into it:

    tar -zxvf INFANTGUTTUTORIAL.tar.gz && cd INFANT-GUT-TUTORIAL

  Import the collection `merens`:

    anvi-import-collection additional-files/collections/merens.txt -p PROFILE.db -c CONTIGS.db -C merens

  Then I run the program `anvi-get-sequences-for-hmm-hits` in the anvi'o master this way:

    anvi-get-sequences-for-hmm-hits -p PROFILE.db \
                                    -c CONTIGS.db \
                                    -C merens \
                                    -o OUTPUT.fa \
                                    --hmm-source Campbell_et_al \
                                    --gene-names Ribosomal_L27,Ribosomal_L28,Ribosomal_L3 \
                                    --return-best-hit \
                                    --get-aa-sequences \
                                    --concatenate
"""

import os
import sys
import argparse

import anvio
import anvio.dbops as dbops
import anvio.utils as utils
import anvio.terminal as terminal
import anvio.ccollections as ccollections

from anvio.dbops import ContigsSuperclass
from anvio.hmmops import SequencesForHMMHits
from anvio.errors import ConfigError, FilesNPathsError


__author__ = "A. Murat Eren"
__copyright__ = "Copyright 2015, The anvio Project"
__credits__ = []
__license__ = "GPL 3.0"
__version__ = anvio.__version__
__maintainer__ = "A. Murat Eren"
__email__ = "a.murat.eren@gmail.com"


run = terminal.Run()
progress = terminal.Progress()


def main(args):
    gene_names = [g.strip() for g in args.gene_names.split(',')] if args.gene_names else []
    hmm_sources = set([s.strip() for s in args.hmm_sources.split(',')]) if args.hmm_sources else set([])

    if args.list_hmm_sources:
        info_table = SequencesForHMMHits(args.contigs_db).hmm_hits_info
        for source in info_table:
            t = info_table[source]
            run.info_single('%s [type: %s] [num genes: %d]' % (source, t['search_type'], len(t['genes'].split(','))))
        sys.exit(0)

    if args.list_available_gene_names:
        info_table = SequencesForHMMHits(args.contigs_db).hmm_hits_info
        for source in hmm_sources or info_table:
            t = info_table[source]
            run.info_single('%s [type: %s]: %s' % (source, t['search_type'], ', '.join(sorted(t['genes'].split(',')))), nl_after = 2)
        sys.exit(0)

    # let's do some initial checks if the user is interested in concatanating genes. FIXME: these should probably go into the
    # SequencesForHMMHits class sanity check.
    if args.concatenate_genes:
        if not args.return_best_hit:
            raise ConfigError("If you want your genes to be concatenated into a multi-alignment file, you must also ask for\
                               the best hit (using the `--report-best-hit`) flag to avoid issues if there are more than one\
                               hit for a gene in a given genome. Anvi'o could have set this flag on your behalf, but it just\
                               is not that kind of a platform :/")

        if not len(gene_names):
            run.warning("You did not define any gene names. Bold move. Now anvi'o will attempt to report a file with all\
                         genes defined in the HMM source '%s'." % list(hmm_sources)[0])


    if args.profile_db and not args.collection_name:
        raise ConfigError("You can't use this program with a profile database but without a collection name. Yes. Because.")

    if args.profile_db:
        dbops.is_profile_db_and_contigs_db_compatible(args.profile_db, args.contigs_db)
        splits_dict = ccollections.GetSplitNamesInBins(args).get_dict()
        run.info('Init', '%d splits in %d bin(s)' % (sum([len(v) for v in list(splits_dict.values())]), len(splits_dict)))
    else:
        contigs_db = ContigsSuperclass(args, r = run, p = progress)
        splits_dict = {os.path.basename(args.contigs_db[:-3]): list(contigs_db.splits_basic_info.keys())}

    s = SequencesForHMMHits(args.contigs_db, sources = hmm_sources)

    hmm_sequences_dict = s.get_sequences_dict_for_hmm_hits_in_splits(splits_dict, return_amino_acid_sequences=args.get_aa_sequences)

    run.info('Hits', '%d hits for %d source(s)' % (len(hmm_sequences_dict), len(s.sources)))

    if len(gene_names):
        hmm_sequences_dict = utils.get_filtered_dict(hmm_sequences_dict, 'gene_name', set(gene_names))
        run.info('Filtered hits', '%d hits remain after filtering for %d gene(s)' % (len(hmm_sequences_dict), len(gene_names)))

    if not hmm_sequences_dict:
        raise ConfigError("Your selections resulted in 0 hits. There is nothing to report. Are you\
                            sure you have the right set of gene names and sources? If you\
                            select an HMM source, and then use gene names that belong to another\
                            source, the intersection of the two can be empty. Just saying.")

    if args.return_best_hit:
        run.warning("You requested only the best hits to be reported, which means, if, say, there are more than one RecA\
                     hits in a bin for a given HMM source, only the one with the lowest e-value will be kept, and others\
                     will be removed from your final results.")

        if not args.profile_db:
            run.warning("You requested to get only the best hits, but you did not provide a profile database. At this point\
                         anvi'o just hopes you know what you are doing. Since this is like the zone of 'potentially a terrible\
                         idea but it may be quite relevant when done right'.")

        hmm_sequences_dict = s.filter_hmm_sequences_dict_for_splits_to_keep_only_best_hits(hmm_sequences_dict)

        run.info('Filtered hits', '%d hits remain after removing weak hits for multiple hits' % (len(hmm_sequences_dict)))

    # FIXME: We need the user to be able to define this:
    if args.separator:
        separator = args.separator
    else:
        separator = 'XXX' if args.get_aa_sequences else 'NNN'

    # the magic is happening here:
    s.store_hmm_sequences_into_FASTA(hmm_sequences_dict, \
                                     args.output_file, \
                                     concatenate_genes=args.concatenate_genes, \
                                     separator=separator, \
                                     genes_order=list(gene_names) if len(gene_names) else None)

    run.info('Mode', 'AA seqeunces' if args.get_aa_sequences else 'DNA seqeunces', mc='green')
    run.info('Genes are concatenated', args.concatenate_genes)
    run.info('Output', args.output_file)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Get sequences for HMM hits")

    parser.add_argument(*anvio.A('contigs-db'), **anvio.K('contigs-db'))
    parser.add_argument(*anvio.A('profile-db'), **anvio.K('profile-db', {'required': False}))
    parser.add_argument(*anvio.A('collection-name'), **anvio.K('collection-name'))
    parser.add_argument(*anvio.A('bin-id'), **anvio.K('bin-id'))
    parser.add_argument(*anvio.A('bin-ids-file'), **anvio.K('bin-ids-file'))
    parser.add_argument(*anvio.A('hmm-sources'), **anvio.K('hmm-sources'))
    parser.add_argument(*anvio.A('gene-names'), **anvio.K('gene-names'))
    parser.add_argument(*anvio.A('output-file'), **anvio.K('output-file', {'required': True}))
    parser.add_argument(*anvio.A('list-hmm-sources'), **anvio.K('list-hmm-sources'))
    parser.add_argument(*anvio.A('list-available-gene-names'), **anvio.K('list-available-gene-names'))
    parser.add_argument(*anvio.A('get-aa-sequences'), **anvio.K('get-aa-sequences'))
    parser.add_argument(*anvio.A('return-best-hit'), **anvio.K('return-best-hit'))
    parser.add_argument(*anvio.A('concatenate-genes'), **anvio.K('concatenate-genes'))
    parser.add_argument(*anvio.A('separator'), **anvio.K('separator', {'help': 'A word that will be used to\
                                  sepaate concatenated gene sequences from each other (IF you are using this\
                                  program with `--concatenate-genes` flag). The default is "XXX" for amino\
                                  acid sequences, and "NNN" for DNA sequences'}))

    args = parser.parse_args()

    try:
        main(args)
    except ConfigError as e:
        print(e)
        sys.exit(-1)
    except FilesNPathsError as e:
        print(e)
        sys.exit(-1)
