#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Get sequences for all HMM hits in a given bin.

   This program takes a profile database, a collection ID, a bin name, and an
   HMM source, and returns the sequences of HMM hits. It is useful when you
   want to get the actual sequences for each single-copy gene hit in a
   particular genome bin.
"""

import sys
import argparse

import anvio
import anvio.terminal as terminal
import anvio.ccollections as ccollections

from anvio.errors import ConfigError, FilesNPathsError
from anvio.hmmops import SequencesForHMMHits


# Module metadata. The version is taken from the installed anvio package
# rather than being hard-coded here.
__author__ = "A. Murat Eren"
__copyright__ = "Copyright 2015, The anvio Project"
__credits__ = []
__license__ = "GPL 3.0"
__version__ = anvio.__version__
__maintainer__ = "A. Murat Eren"
__email__ = "a.murat.eren@gmail.com"


# Shared terminal helpers. `run` is what main() uses to report its messages;
# `progress` is instantiated here but not referenced in the visible part of
# this file.
run = terminal.Run()
progress = terminal.Progress()


def main(args):
    """Report the available HMM sources, or export HMM hit sequences for bins.

    When `args.list_available_hmm_sources` is set, one line is reported for
    each entry of the search info table of `args.contigs_db` and the process
    exits with status 0.

    Otherwise the split names of the requested bins are collected through
    `ccollections.GetSplitNamesInBins(args)`, the HMM hit sequences for those
    splits are retrieved, and they are handed to
    `store_hmm_sequences_into_FASTA` together with `args.output_file`.

    `args.hmm_sources`, when given, is read as a comma-separated list of
    source names; whitespace around each name is dropped. When it is not
    given, an empty set of sources is passed on.
    """
    if args.list_available_hmm_sources:
        search_info = SequencesForHMMHits(args.contigs_db).search_info_table
        for source_name in search_info:
            entry = search_info[source_name]
            search_type = entry['search_type']
            # 'genes' is a comma-separated string; count its items.
            num_genes = len(entry['genes'].split(','))
            run.info_single('%s [type: %s] [num genes: %d]' % (source_name, search_type, num_genes))
        sys.exit(0)

    bins_to_splits = ccollections.GetSplitNamesInBins(args).get_dict()
    num_splits = sum(len(split_names) for split_names in bins_to_splits.values())
    run.info('Init', '%d splits in %d bin(s)' % (num_splits, len(bins_to_splits)))

    if args.hmm_sources:
        sources = set(name.strip() for name in args.hmm_sources.split(','))
    else:
        sources = set()

    hmm_hits = SequencesForHMMHits(args.contigs_db, sources=sources)

    sequences = hmm_hits.get_hmm_sequences_dict_for_splits(bins_to_splits)
    run.info('Result', '%d genes for %d source(s)' % (len(sequences), len(hmm_hits.sources)))

    hmm_hits.store_hmm_sequences_into_FASTA(sequences, args.output_file)
    run.info('Output', args.output_file)


if __name__ == '__main__':
    # Command-line entry point: build the argument parser from anvio's shared
    # argument definitions, then run main() and turn the two anvio error
    # types into a printed message plus a non-zero exit status.
    parser = argparse.ArgumentParser(description="Get sequences for all HMM hits in a given bin")

    # Each name is looked up through anvio.A (positional add_argument
    # arguments) and anvio.K (keyword add_argument arguments). The order
    # below is the order in which the arguments are registered.
    argument_names = (
        'profile-db',
        'contigs-db',
        'collection-id',
        'bin-id',
        'bin-ids-file',
        'hmm-sources',
        'output-file',
        'list-available-hmm-sources',
    )
    for argument_name in argument_names:
        parser.add_argument(*anvio.A(argument_name), **anvio.K(argument_name))

    args = parser.parse_args()

    try:
        main(args)
    except (ConfigError, FilesNPathsError) as e:
        # `except ... as` and the parenthesised print are valid on both
        # Python 2.6+ and Python 3, unlike the old `except X, e` / `print e`.
        print(e)
        sys.exit(-1)
