#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys

import anvio
import anvio.utils as utils
import anvio.terminal as terminal
import anvio.filesnpaths as filesnpaths
import anvio.ccollections as ccollections

from anvio.errors import ConfigError, FilesNPathsError


# Script metadata; the version is taken from the installed anvio package.
__author__ = "A. Murat Eren"
__copyright__ = "Copyright 2016, The anvio Project"
__credits__ = []
__license__ = "GPL 3.0"
__version__ = anvio.__version__
__maintainer__ = "A. Murat Eren"
__email__ = "a.murat.eren@gmail.com"


# Module-level terminal helpers: main() reports its status through `run` and
# passes counts through `pp` before displaying them.
run = terminal.Run()
progress = terminal.Progress()
pp = terminal.pretty_print


def main(args):
    """Export gene caller ids per genome for every PC bin of a collection.

    For each (genome, PC bin) pair, the gene caller ids of the entries in the
    protein clusters file that belong to that genome and to one of the protein
    clusters in that bin are written, one per line, to a file named
    'gene-caller-ids_for-<collection>-<genome>-<bin>.txt'. The file name is
    relative, so the files land in the current working directory.

    `args` is expected to carry `protein_clusters`, `profile_db`,
    `collection_name`, and `list_collections`. When `list_collections` is set,
    the available collections are listed and the program exits without
    exporting anything.

    Raises ConfigError when the profile database or the collection name is
    missing, or when the collection name is not found in the profile database.
    Errors raised by the anvio helpers called here propagate unchanged.
    """

    filesnpaths.is_file_tab_delimited(args.protein_clusters)

    if not args.profile_db:
        raise ConfigError("Without a profile database, you do not exist as far as this program is concerned. True story :(")

    collections = ccollections.Collections()
    collections.populate_collections_dict(args.profile_db)

    if args.list_collections:
        collections.list_collections()
        sys.exit()

    # a missing name must be checked before the membership test below, otherwise
    # the user is told that "None" is not a valid collection instead of being
    # asked to provide one.
    if not args.collection_name:
        raise ConfigError("You need to provide a collection name. You can use `--list-collections` flag if you\
                            don't remember under what collection name you stored your PC bins.")

    if args.collection_name not in collections.collections_dict:
        raise ConfigError("%s is not a valid collection ID. See a list of available ones with '--list-collections' flag" % args.collection_name)

    PC_bins = collections.get_collection_dict(args.collection_name)
    run.info('Collection "%s"' % args.collection_name, '%s protein cluster bins describes %s PCs.' % (len(PC_bins), pp(sum(len(v) for v in PC_bins.values()))))

    protein_clusters_dict = utils.get_TAB_delimited_file_as_dictionary(args.protein_clusters)
    run.info('Protein clusters', '%s entries are read into the memory.' % pp(len(protein_clusters_dict)))

    # group the entries by genome once (keeping their original order) so the
    # export loop below does not have to rescan the entire dictionary for every
    # genome / bin combination.
    entries_by_genome = {}
    for entry in protein_clusters_dict.values():
        entries_by_genome.setdefault(entry['genome_name'], []).append(entry)

    # sorted, so the report and the order of the export are reproducible between runs
    genomes = sorted(entries_by_genome)
    run.info('Genomes found (%d)' % len(genomes), '%s.' % (', '.join(genomes)))

    # build the set of protein cluster ids of each bin a single time for fast membership tests
    protein_clusters_per_bin = {PC_bin: set(PC_ids) for PC_bin, PC_ids in PC_bins.items()}

    run.info_single('Storing gene caller ids: ', nl_before=1, nl_after=1)
    # for each genome, we want to identify matching protein clusters in the protein clusters dict
    # this is likely going to become a primary data structure somewhere when we expand into pangenomics more
    # and more.
    for genome in genomes:
        for PC_bin, protein_clusters_of_interest in protein_clusters_per_bin.items():
            gene_caller_ids = [entry['gene_caller_id'] for entry in entries_by_genome[genome] if entry['protein_cluster_id'] in protein_clusters_of_interest]

            output_file_name = 'gene-caller-ids_for-%s-%s-%s.txt' % (args.collection_name, genome, PC_bin)

            # the context manager guarantees the file is flushed and closed
            # before its path is reported to the user.
            with open(output_file_name, 'w') as output_file:
                output_file.write('\n'.join(str(gene_caller_id) for gene_caller_id in gene_caller_ids))

            run.info('Gene calls (%s) [%s/%s]' % (pp(len(gene_caller_ids)), genome, PC_bin), output_file_name)


if __name__ == '__main__':
    import argparse

    # Program description shown in the command line help.
    description = 'Export gene caller ids per genome for a given PC bin. For details,\
                                                  See the pangenomic workflow and an example usage at the URL\
                                                  http://merenlab.org/2015/11/14/pangenomics/'

    parser = argparse.ArgumentParser(description=description)

    # The only positional argument: the protein clusters file to read.
    parser.add_argument(
        'protein_clusters',
        help='protein-clusters.txt file (one of the key output files of anvi-pan-genome)',
    )

    # Options whose flags and settings come from anvio's shared argument definitions.
    parser.add_argument(*anvio.A('profile-db'), **anvio.K('profile-db', {'required': False}))
    parser.add_argument(*anvio.A('collection-name'), **anvio.K('collection-name'))
    parser.add_argument(*anvio.A('list-collections'), **anvio.K('list-collections'))

    args = parser.parse_args()

    # Both anvio error types are handled the same way: print the message and
    # terminate with exit status -1.
    try:
        main(args)
    except (ConfigError, FilesNPathsError) as error:
        print(error)
        sys.exit(-1)
