#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Export aligned sequences from anvi'o pan genomes"""

import os
import sys
import argparse

import anvio
import anvio.dbops as dbops
import anvio.utils as utils
import anvio.terminal as terminal
import anvio.filesnpaths as filesnpaths
import anvio.summarizer as summarizer

from anvio.errors import ConfigError, FilesNPathsError, DictIOError


# Program metadata. `__description__` is reused below as the argparse description.
# NOTE(review): `__requires__` / `__provides__` look like artifact names consumed by
# tooling outside this file -- confirm before renaming any of them.
__author__ = "Developers of anvi'o (see AUTHORS.txt)"
__copyright__ = "Copyleft 2015-2018, the Meren Lab (http://merenlab.org/)"
__credits__ = ["Ryan Bartelme", "Jay Osvatic"]
__license__ = "GPL 3.0"
__version__ = anvio.__version__
__maintainer__ = "A. Murat Eren"
__email__ = "a.murat.eren@gmail.com"
__requires__ = ['pan-db', 'genomes-storage-db']
__provides__ = ['genes-fasta', 'concatenated-gene-alignment-fasta']
__description__ = "Do cool stuff with gene clusters in anvi'o pan genomes"


# Module-level terminal helpers; `main` uses them to report its mode, the
# filtering summary, and the short 'Initializing' progress step.
run = terminal.Run()
progress = terminal.Progress()


def _read_gene_cluster_ids_from_file(file_path):
    """Return the set of gene cluster IDs listed one per line in `file_path`.

    Surrounding whitespace is stripped from every line and blank lines are
    skipped, so a trailing empty line does not turn into an empty ID.

    Raises
    ======
    ConfigError
        If the file has more than one TAB-delimited column, or if it does not
        contain any IDs at all.
    """
    columns = utils.get_columns_of_TAB_delim_file(file_path, include_first_column=True)
    if len(columns) != 1:
        raise ConfigError("The input file for gene cluster IDs must contain a single column. It seems yours has %d :/" % len(columns))

    # the context manager guarantees the file handle is closed once the IDs are read
    with open(file_path, 'r') as ids_file:
        stripped_lines = (line.strip() for line in ids_file)
        gene_cluster_ids = {gene_cluster_id for gene_cluster_id in stripped_lines if gene_cluster_id}

    # an empty selection is later treated as 'no selection', so refuse it here rather
    # than silently working with something the user did not ask for
    if not gene_cluster_ids:
        raise ConfigError("The input file for gene cluster IDs does not seem to contain any gene cluster IDs :/")

    return gene_cluster_ids


def main(args):
    """Write sequences of the requested gene clusters in a pan database to a file.

    The function first checks that the parameters in `args` are compatible with
    each other, then works out the set of gene cluster IDs to work with (a bin in
    a collection, a single ID, or IDs listed in a file; the set stays empty when
    none of these is given), applies the filters in `args` through
    `filter_gene_clusters_dict`, and finally writes either concatenated sequences
    for phylogenomics or plain per-gene-cluster sequences.

    Parameters
    ==========
    args : argparse.Namespace
        Parsed command line arguments. Attributes read directly here are
        `pan_db`, `output_file`, `partition_file`, `gene_cluster_id`,
        `gene_cluster_ids_file`, `collection_name`, `bin_id`, `list_collections`,
        `list_bins`, `concatenate_gene_clusters`, `align_with`, `separator`,
        `report_DNA_sequences`, and `dry_run`. The whole namespace is also handed
        to `PanSummarizer`, `PanSuperclass`, and `filter_gene_clusters_dict`.
        `args.output_file` is set in place when it was not provided.

    Raises
    ======
    ConfigError
        When mutually exclusive or dependent parameters are combined incorrectly,
        when a collection is used without a bin id, or when the gene cluster IDs
        file is malformed.

    Notes
    =====
    Calls `sys.exit(0)` after the summary message when `args.dry_run` is set.
    """
    if args.gene_cluster_id and args.gene_cluster_ids_file:
        raise ConfigError('You should either declare a single gene cluster name, or gene cluster names in a file')

    if (args.gene_cluster_id or args.gene_cluster_ids_file) and args.collection_name:
        raise ConfigError('You can either declare specific list of gene clusters to work with (through `--gene-cluster-id` or `--gene-cluster-ids-file`) or '
                          'go the collection way using parameters `--collection-name` and `--bin-name`. Those are not to be '
                          'mixed. If you need to know what collections are available in the pan database, use the flag '
                          '`--list-collections`.')

    # without an explicit output path, the file goes next to the pan database
    if not args.output_file:
        args.output_file = os.path.join(os.path.dirname(os.path.abspath(args.pan_db)), 'gene-cluster-alignments-output-with-an-ugly-name.fa')

    if not args.concatenate_gene_clusters and args.align_with:
        raise ConfigError("If you are not asking for concatenated gene clusters, then you really don't need to specify an aligner. "
                          "Yes, anvi'o could ignore that flag and move on, but who would we become if we get used to ignoring those "
                          "little signs of 'I am not sure what I am doing'?")

    if not args.concatenate_gene_clusters and args.partition_file:
        raise ConfigError("Partition files are only relevant when you use the flag `--concatenate-gene-clusters`.")

    filesnpaths.is_output_file_writable(args.output_file)
    if args.partition_file:
        filesnpaths.is_output_file_writable(args.partition_file)

    gene_cluster_ids = set()
    if args.collection_name or args.list_collections or args.list_bins:
        # NOTE(review): `--list-collections` / `--list-bins` also take this branch;
        # presumably PanSummarizer reports and exits for those flags before the bin
        # id check below is reached -- confirm.
        progress.new('Initializing')
        progress.update('...')
        pan = summarizer.PanSummarizer(args, r=terminal.Run(verbose=False), p=terminal.Progress(verbose=False))
        progress.end()

        if not args.bin_id:
            raise ConfigError("When you use a collection name, you must also declare a bin id :/ You don't know what bin you want? Use the flag "
                              "`--list-bins`.")

        pan.collections.is_bin_in_collection(collection_name=args.collection_name, bin_name=args.bin_id)
        collection_dict = pan.collections.get_collection_dict(args.collection_name)
        gene_cluster_ids = set(collection_dict[args.bin_id])

        run.info('Mode', 'Working with gene clusters in collection %s and bin %s.' % (args.collection_name, args.bin_id))
    elif args.gene_cluster_id:
        gene_cluster_ids = {args.gene_cluster_id}
        run.info('Mode', 'Working with a single gene cluster.')
    elif args.gene_cluster_ids_file:
        gene_cluster_ids = _read_gene_cluster_ids_from_file(args.gene_cluster_ids_file)
        run.info('Mode', 'Reporting alignments for a list of gene clusters from an input file.')
    else:
        run.info('Mode', 'Working with all gene clusters.')

    pan = dbops.PanSuperclass(args)
    pan.init_gene_clusters(gene_cluster_ids)

    # this will simply return pan.gene_clusters if there are no filters requested
    filtered_gene_clusters_dict = pan.filter_gene_clusters_dict(args)

    num_gene_clusters = len(filtered_gene_clusters_dict)
    # NOTE(review): this counts the top-level keys of each gene cluster entry; confirm
    # that those keys are genes (and not, say, genomes) so the message below is accurate.
    num_genes = sum(len(gc.keys()) for gc in filtered_gene_clusters_dict.values())
    run.warning("Your filters resulted in %d gene clusters that contain a total of %d genes "
                "for downstream analyses. Just so you know." % (num_gene_clusters, num_genes), header="INFO", lc="green")

    if args.dry_run:
        run.info_single("Dry run, eh? Well. Your dry run is done!")
        sys.exit(0)

    if args.concatenate_gene_clusters:
        pan.write_sequences_in_gene_clusters_for_phylogenomics(gene_clusters_dict=filtered_gene_clusters_dict,
                                                               output_file_path=args.output_file,
                                                               report_DNA_sequences=args.report_DNA_sequences,
                                                               align_with=args.align_with,
                                                               separator=args.separator,
                                                               partition_file_path=args.partition_file)
    else:
        pan.write_sequences_in_gene_clusters_to_file(gene_clusters_dict=filtered_gene_clusters_dict,
                                                     output_file_path=args.output_file,
                                                     report_DNA_sequences=args.report_DNA_sequences)


if __name__ == '__main__':
    # Command line interface. Every flag is looked up by name through `anvio.A`
    # (positional arguments for `add_argument`) and `anvio.K` (keyword arguments);
    # groups and flags are registered in the order they should appear in the help.
    parser = argparse.ArgumentParser(description=__description__)

    input_group = parser.add_argument_group('INPUT FILES', "Input files from the pangenome analysis.")
    input_group.add_argument(*anvio.A('pan-db'), **anvio.K('pan-db'))
    # the genomes storage gets an explicit override so it is not a required argument
    input_group.add_argument(*anvio.A('genomes-storage'), **anvio.K('genomes-storage', {'required': False}))

    output_group = parser.add_argument_group('OUTPUT', "You get to chose an output file name to report things. The default will be\
                                       an ugly name. So, be explicit.")
    output_group.add_argument(*anvio.A('output-file'), **anvio.K('output-file',{'metavar':'FASTA'}))
    output_group.add_argument(*anvio.A('report-DNA-sequences'), **anvio.K('report-DNA-sequences'))

    selection_group = parser.add_argument_group('SELECTION', "Which gene clusters should be reported. You can ask for a single gene cluster,\
                                       or multiple ones listed in a file, or you can use a collection and bin name to list gene clusters\
                                       of interest. If you give nothing, this program will export alignments for every single gene cluster\
                                       found in the profile database (and this is called 'customer service').")
    for flag in ('gene-cluster-id',
                 'gene-cluster-ids-file',
                 'collection-name',
                 'bin-id'):
        selection_group.add_argument(*anvio.A(flag), **anvio.K(flag))

    filters_group = parser.add_argument_group('ADVANCED FILTERS', "If you are here you must be looking for ways to specify\
                                        exactly what you want from that database of gene clusters. These filters will\
                                        be applied to what your previous selections reported.")
    for flag in ('min-num-genomes-gene-cluster-occurs',
                 'max-num-genomes-gene-cluster-occurs',
                 'min-num-genes-from-each-genome',
                 'max-num-genes-from-each-genome',
                 'max-num-gene-clusters-missing-from-genome',
                 'min-functional-homogeneity-index',
                 'max-functional-homogeneity-index',
                 'min-geometric-homogeneity-index',
                 'max-geometric-homogeneity-index',
                 'min-combined-homogeneity-index',
                 'max-combined-homogeneity-index',
                 'add-into-items-additional-data-table'):
        filters_group.add_argument(*anvio.A(flag), **anvio.K(flag))

    other_group = parser.add_argument_group('OTHER STUFF', "Yes. Stuff that are not like the ones above.")
    for flag in ('list-collections', 'list-bins'):
        other_group.add_argument(*anvio.A(flag), **anvio.K(flag))

    phylogenomics_group = parser.add_argument_group('PHYLOGENOMICS', "Get separately aligned and concatenated sequences for phylogenomics.")
    for flag in ('concatenate-gene-clusters',
                 'partition-file',
                 'separator',
                 'align-with',
                 'list-aligners'):
        phylogenomics_group.add_argument(*anvio.A(flag), **anvio.K(flag))

    life_savers_group = parser.add_argument_group('LIFE SAVERS', "Just when you need them.")
    for flag in ('just-do-it', 'dry-run'):
        life_savers_group.add_argument(*anvio.A(flag), **anvio.K(flag))


    args = anvio.get_args(parser)

    # Known anvi'o errors are printed and mapped to distinct exit codes:
    # ConfigError -> -1, FilesNPathsError -> -2, DictIOError -> -3. The checks run
    # in that order, so the first matching class decides the code.
    try:
        main(args)
    except (ConfigError, FilesNPathsError, DictIOError) as e:
        print(e)
        if isinstance(e, ConfigError):
            sys.exit(-1)
        elif isinstance(e, FilesNPathsError):
            sys.exit(-2)
        else:
            sys.exit(-3)
