#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""A script to import collections (and their colors)"""

import sys
from collections import Counter

import anvio
import anvio.tables as t
import anvio.dbops as dbops
import anvio.utils as utils
import anvio.terminal as terminal
import anvio.filesnpaths as filesnpaths

from anvio.errors import ConfigError, FilesNPathsError


# script metadata; the version is taken from the installed anvio package
__author__ = "A. Murat Eren"
__copyright__ = "Copyright 2015, The anvio Project"
__credits__ = []
__license__ = "GPL 3.0"
__version__ = anvio.__version__
__email__ = "a.murat.eren@gmail.com"


# module-wide terminal reporter; the functions below use it for their
# info and warning messages
run = terminal.Run()


def main(args):
    """Import the bin assignments described in `args.data` as a collection.

    The input file is a two-column TAB-delimited file without a header that maps
    split names (or contig names, when `args.contigs_mode` is set) to bin names.
    When a contigs database is given, only names that are also found in it are
    kept, and contig names are resolved into their split names. The resulting
    collection is stored in the pan/profile database if one is given, and in the
    contigs database otherwise.

    Parameters
    ==========
    args : argparse.Namespace
        Reads `data`, `contigs_db`, `pan_or_profile_db`, `collection_name`,
        `bins_info`, and `contigs_mode`.

    Raises
    ======
    ConfigError
        If `sanity_check` rejects the arguments, if the bins info file cannot be
        read, or if no name in the input file matches a name in the database.
    """
    sanity_check(args)

    we_have_a_contigs_db = bool(args.contigs_db)

    # initiate the contigs database if it is present
    if we_have_a_contigs_db:
        contigs_db = dbops.ContigsDatabase(args.contigs_db)
        try:
            splits_basic_info = contigs_db.db.get_table_as_dict(t.splits_info_table_name)
            contigs_basic_info = contigs_db.db.get_table_as_dict(t.contigs_info_table_name, string_the_key = True)
            contig_name_to_splits_dict = utils.get_contig_name_to_splits_dict(splits_basic_info, contigs_basic_info)
        finally:
            # release the connection even if reading the tables fails
            contigs_db.disconnect()

        run.info('Num contigs in the contigs database', len(contigs_basic_info))
        run.info('Num splits in the contigs database', len(splits_basic_info))

    # the kind of item the input file is expected to describe (used in messages)
    item_label = 'contigs' if args.contigs_mode else 'splits'

    # read the input file with split/contig - bin ID associations
    input_data_file_content = utils.get_TAB_delimited_file_as_dictionary(args.data, no_header = True, column_names = ['split_id', 'bin_name'])

    # populate bins_info_dict if there is any information about bins
    bins_info_dict = {}
    if args.bins_info:
        try:
            bins_info_dict = utils.get_TAB_delimited_file_as_dictionary(args.bins_info, no_header = True, column_names = ['bin_name', 'source', 'html_color'])
        except Exception as e:
            raise ConfigError("Someone was not happy with the TAB-delimited bins info file you provided. Here\
                                is the complaint: %s" % e)
    run.info('Num %s in the input file' % item_label, len(input_data_file_content))
    run.info('Num bins the input file describes', len({entry['bin_name'] for entry in input_data_file_content.values()}))

    # learning about the input names like a pro
    input_names = set(input_data_file_content)

    # here we attempt to make sure the names in the input file are relevant to the
    # names in the contigs database. but clearly this is not possible if there is no
    # contigs database associated with the profile database, so if there is none,
    # we cheat:
    if we_have_a_contigs_db:
        db_names = set(contigs_basic_info if args.contigs_mode else splits_basic_info)
    else:
        db_names = input_names

    entry_names_overlap_between_input_and_db = db_names & input_names

    if not entry_names_overlap_between_input_and_db:
        # peek at one example from each side without mutating the sets. `set.pop()`
        # would raise a KeyError if either set is empty (i.e., an input file with no
        # entries), hiding the actual problem from the user.
        example_from_file = next(iter(input_names), None)
        example_from_db = next(iter(db_names), None)

        raise ConfigError("There is no overlap between the %(item)s names found in your input file\
                            and %(item)s names found in the database. This is odd! For instance one\
                            of the names from your file looks like this: '%(from_file)s', and this is\
                            an example name from the database: '%(from_db)s'. Please issue a report if\
                            you are almost certain that this is anvi'o's fault." \
                                % {'item': item_label, 'from_file': example_from_file, 'from_db': example_from_db})

    run.info('Num %s both input file and the db has' % item_label, len(entry_names_overlap_between_input_and_db))

    # bin name -> set of split names
    data = {}

    # populate the data dictionary
    for entry_name, entry in input_data_file_content.items():
        if entry_name not in entry_names_overlap_between_input_and_db:
            continue

        # in contigs mode every contig name expands into the names of its splits
        split_names = contig_name_to_splits_dict[entry_name] if args.contigs_mode else [entry_name]
        data.setdefault(entry['bin_name'], set()).update(split_names)

    if args.contigs_mode:
        run.info_single('%d of %d contig names in the input file that matched to contig names in the database\
                         resolved into %d split names that described in %d bins.' % (len(entry_names_overlap_between_input_and_db),
                                                                                     len(input_data_file_content),
                                                                                     sum(len(split_names) for split_names in data.values()),
                                                                                     len(data)), nl_before = 1)

    # the collection goes into the pan/profile database when there is one, and into
    # the contigs database otherwise
    if args.pan_or_profile_db:
        collections = dbops.TablesForCollections(args.pan_or_profile_db)
    else:
        collections = dbops.TablesForCollections(args.contigs_db)

    collections.append(args.collection_name, data, bins_info_dict)


def sanity_check(args):
    """Validate the command line arguments before any data is imported.

    Checks that the given databases are of the expected type and are a sensible
    combination, that the collection name is acceptable, that the input files
    are TAB-delimited with the expected number of columns, and that no name
    occurs more than once in the first column of the input file. Warnings are
    printed when no contigs database or no pan/profile database is given.

    Parameters
    ==========
    args : argparse.Namespace
        Reads `data`, `contigs_db`, `pan_or_profile_db`, `collection_name`,
        `bins_info`, and `contigs_mode`.

    Raises
    ======
    ConfigError
        If any of the checks implemented in this function fails. The helpers
        called from `dbops` and `filesnpaths` may raise their own errors.
    """
    if not args.pan_or_profile_db and not args.contigs_db:
        raise ConfigError("You must at least provide an anvi'o contigs database for this to work :(")

    if args.pan_or_profile_db:
        dbops.is_pan_or_profile_db(args.pan_or_profile_db)

    if args.contigs_db:
        dbops.is_contigs_db(args.contigs_db)

    # learn the database type once; it is needed for two of the checks below
    db_type = dbops.get_db_type(args.pan_or_profile_db) if args.pan_or_profile_db else None

    if db_type == 'pan' and args.contigs_db:
        raise ConfigError("There is no need to provide a contigs database when you are working with an anvi'o pan\
                            database")

    if not args.contigs_db and args.contigs_mode:
        raise ConfigError("There is no reason for you to use the `--contigs-mode` flag when you have\
                            not declared an anvi'o contigs database")

    # if there is a profile database, check whether there is a contigs database associated with the profile
    if db_type == 'profile':
        # NOTE(review): this database object is never explicitly released here;
        # confirm whether ProfileDatabase offers a disconnect method and call it.
        pan_or_profile_db = dbops.ProfileDatabase(args.pan_or_profile_db)

        if pan_or_profile_db.meta['contigs_db_hash'] and not args.contigs_db:
            raise ConfigError("The profile database you provided is associated with an anvi'o contigs database (i.e.,\
                                it is not 'blank' or an ad hoc profile database). In this case, you must provide a path\
                                for the contigs database. Sorry :/")

    if not args.collection_name:
        raise ConfigError("You must give a name for this collection.")

    if not args.contigs_db:
        run.warning("You did not provide a contigs database. Fine. So be it. But know this: anvi'o has no way to check\
                     the consistency of names you provide in the input file. So if you made a mistake while generating\
                     this collection, it probably will cause issues later on.")

    if not args.pan_or_profile_db:
        run.warning("Since you haven't provided an anvi'o profile database, this program will add your collection into\
                     the contigs database you provided. If you use the same collection name later in one of your profile\
                     databases that will be generated from this contigs database, things may go South, and anvi'o would\
                     not even care.")

    try:
        utils.check_sample_id(args.collection_name)
    except Exception:
        # `Exception` rather than a bare `except`, so that KeyboardInterrupt and
        # SystemExit are not mistaken for an improper collection name
        raise ConfigError('"%s" is not a proper collection name. A proper one should be a single word and not contain\
                            ANY characters but digits, ASCII letters and underscore character(s). There should not be\
                            any space characters, and the collection name should not start with a digit.' % args.collection_name)

    filesnpaths.is_file_tab_delimited(args.data, expected_number_of_fields = 2)
    if args.bins_info:
        filesnpaths.is_file_tab_delimited(args.bins_info, expected_number_of_fields = 3)

    # count how many times each name in the first column occurs; the file is
    # closed as soon as it is read
    with open(args.data) as input_file:
        num_occurences_of_entries = Counter(line.split('\t')[0] for line in input_file)

    # `any` (unlike `max`) is also safe if the file has no lines at all
    if any(count > 1 for count in num_occurences_of_entries.values()):
        raise ConfigError("Some %(item)s names occur more than once in the input file. A %(item)s cannot belong in two\
                            bins, and neither there should be the same bin assignment for a given %(item)s. Long story\
                            short, each name should appear only once in your input file, and it is not the case :/" \
                                                                        % {'item': 'contig' if args.contigs_mode else 'split'})



if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description="Import an external binning result into anvi'o")

    parser.add_argument('data', metavar = "TAB DELIMITED FILE",
                        help = 'The input file that describes bin IDs for each split or contig.')

    # shared anvi'o arguments for which the `required` state is set explicitly
    for arg_id, is_required in (('contigs-db', False),
                                ('pan-or-profile-db', False),
                                ('collection-name', True)):
        parser.add_argument(*anvio.A(arg_id), **anvio.K(arg_id, {'required': is_required}))

    # shared anvi'o arguments that are used without any overrides
    for arg_id in ('bins-info', 'contigs-mode'):
        parser.add_argument(*anvio.A(arg_id), **anvio.K(arg_id))

    args = parser.parse_args()

    # report known anvi'o errors and leave with a distinct exit code for each:
    # -1 for a ConfigError, -2 for a FilesNPathsError
    try:
        main(args)
    except (ConfigError, FilesNPathsError) as e:
        print(e)
        sys.exit(-1 if isinstance(e, ConfigError) else -2)
