#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import argparse

import anvio
import anvio.dbops as dbops
import anvio.utils as utils
import anvio.hmmops as hmmops
import anvio.terminal as terminal
import anvio.filesnpaths as filesnpaths

from anvio.errors import ConfigError, FilesNPathsError


# Module-level metadata; the version is taken from the installed anvio package.
__author__ = "Developers of anvi'o (see AUTHORS.txt)"
__copyright__ = "Copyleft 2015-2018, the Meren Lab (http://merenlab.org/)"
__license__ = "GPL 3.0"
__version__ = anvio.__version__
__maintainer__ = "A. Murat Eren"
__email__ = "a.murat.eren@gmail.com"


# Shared terminal helpers used by main(): `run` reports the output file path
# once the table is written, `progress` reports which contigs database is
# currently being processed.
run = terminal.Run()
progress = terminal.Progress()


def main(args):
    """Write a presence/absence table of HMM hits for a set of contigs databases.

    For every contigs database in `args.input`, each gene of the HMM source
    `args.source` is marked 1 if the database has at least one hit for it and
    0 otherwise. The resulting matrix (one row per contigs database, keyed by
    its project name) is stored as a TAB-delimited file at `args.output_file`.

    If `args.list_sources` is set, the HMM sources available in the first
    contigs database are listed and nothing else is done; neither a source nor
    an output file is needed in that case.

    Parameters
    ==========
    args : argparse.Namespace-like object
        Attributes read: `input`, `source`, `list_sources`, `output_file`.
        Attributes that are absent are treated as None.

    Returns
    =======
    None

    Raises
    ======
    ConfigError
        If no contigs database, HMM source, or output file path is given, if
        the HMM source is not found in the first contigs database, or if two
        contigs databases share the same project name.
    """

    A = lambda x: args.__dict__.get(x)
    input_contig_db_paths = A('input')
    hmm_source = A('source')
    list_sources = A('list_sources')
    output_file_path = A('output_file')

    if not input_contig_db_paths:
        raise ConfigError("You need to provide at least one contigs database for this to work.")

    # Listing sources is a standalone request: handle it before demanding a
    # source or an output file, and return explicitly so we never fall through
    # to the table-building code without gene names (the listing helper may or
    # may not terminate the process on its own).
    if list_sources:
        hmmops.SequencesForHMMHits(input_contig_db_paths[0]).list_available_hmm_sources()
        return

    if not hmm_source:
        raise ConfigError("You need to declare an HMM source for this to work :/")

    if not output_file_path:
        raise ConfigError("This will not work without an output file path.")

    filesnpaths.is_output_file_writable(output_file_path)

    # The gene list of the requested source is read from the first contigs
    # database only; it defines the columns of the output table.
    hmm_hits_info = hmmops.SequencesForHMMHits(input_contig_db_paths[0]).hmm_hits_info
    try:
        source_info = hmm_hits_info[hmm_source]
    except KeyError:
        raise ConfigError("The HMM source '%s' was not found in the contigs database '%s'. You can use "
                          "the flag `--list-sources` to see what is available." \
                                            % (hmm_source, input_contig_db_paths[0]))

    gene_names = [g.strip() for g in source_info['genes'].split(',')]

    d = {}
    num_dbs = len(input_contig_db_paths)
    progress.new('Processing contigs databases')
    try:
        for db_number, contigs_db_path in enumerate(input_contig_db_paths, 1):
            progress.update('%d of %d' % (db_number, num_dbs))

            contigs_db = dbops.ContigsDatabase(contigs_db_path)
            try:
                contigs_name = contigs_db.meta['project_name']
            finally:
                # release the database handle even if the metadata lookup fails
                contigs_db.disconnect()

            # Rows are keyed by project name, so a repeated name would silently
            # replace an earlier row in the output.
            if contigs_name in d:
                raise ConfigError("More than one contigs database has the project name '%s' (the second one "
                                  "is '%s'). Project names must be unique since they identify the rows of "
                                  "the output file." % (contigs_name, contigs_db_path))

            hmm = hmmops.SequencesForHMMHits(contigs_db_path, sources=set([hmm_source]), progress=terminal.Progress(verbose=False))

            # start with every expected gene absent, then flag the ones with hits
            presence = dict((gene_name, 0) for gene_name in gene_names)
            for hit in hmm.hmm_hits.values():
                presence[hit['gene_name']] = 1

            d[contigs_name] = presence
    finally:
        # always close the progress display, even when a database fails to load
        progress.end()

    utils.store_dict_as_TAB_delimited_file(d, output_file_path, headers=['contigs_db'] + sorted(gene_names))

    run.info('Output', output_file_path)


if __name__ == "__main__":
    # Command-line entry point: build the parser, hand the parsed arguments to
    # main(), and translate anvi'o errors into non-zero exit codes.
    parser = argparse.ArgumentParser(description='A simple script to generate a TAB-delimited file for the presence\
                                                  or absence of HMM hits in a given set of contigs databases and an\
                                                  HMM source.')

    parser.add_argument('input', metavar = 'CONTIG DATABASE(S)', nargs='+',
                        help = "One or more anvi'o contigs databases.")
    parser.add_argument('--list-sources', action='store_true', default=False,
                        help = 'Show available single-copy gene sources and exit.')
    parser.add_argument('--source', default=None,
                        help = 'A single HMM source to focus on. This HMM source should be found in all contigs\
                                databases.')
    # The output file is not enforced by the parser so that `--list-sources`
    # is not rejected by argparse for lacking one; main() raises a ConfigError
    # itself when an output path is needed but missing.
    parser.add_argument(*anvio.A('output-file'), **anvio.K('output-file', {'required': False}))

    # Unrecognized arguments are collected in `unknown` and ignored.
    args, unknown = parser.parse_known_args()

    try:
        sys.exit(main(args))
    except ConfigError as e:
        # configuration problems exit with status -1
        print(e)
        sys.exit(-1)
    except FilesNPathsError as e:
        # file/path problems exit with status -2
        print(e)
        sys.exit(-2)
