#!/usr/bin/env python
# -*- coding: utf-8

import sys

import anvio
import anvio.fastalib as u
import anvio.utils as utils
import anvio.terminal as terminal
import anvio.filesnpaths as filesnpaths

from anvio.errors import ConfigError, FilesNPathsError

__author__ = "A. Murat Eren"
__copyright__ = "Copyright 2016, The anvio Project"
__credits__ = []
__license__ = "GPL 3.0"
__version__ = anvio.__version__
__maintainer__ = "A. Murat Eren"
__email__ = "a.murat.eren@gmail.com"


run = terminal.Run()
progress = terminal.Progress()


def remove_short_contigs_from_fasta(args):
    filesnpaths.is_output_file_writable(args.output_file)
    filesnpaths.is_file_exists(args.contigs_fasta)

    report_file = open(args.report_file, 'w') if args.report_file and args.simplify_names else None
    prefix = args.prefix if args.prefix else None

    if prefix:
        utils.is_this_name_OK_for_database('contig name prefix', prefix)

    if args.input_ids:
        filesnpaths.is_file_exists(args.input_ids)
        input_ids = set([l.split('\t')[0].strip() for l in open(args.input_ids, 'rU').readlines()])
        run.info('Input IDs to remove', '%d found' % len(input_ids))
    else:
        input_ids = set([])

    output = u.FastaOutput(args.output_file)
    fasta = u.SequenceSource(args.contigs_fasta)

    run.info('Input', args.contigs_fasta)
    run.info('Output', args.output_file)
    run.info('Minimum length', args.min_len)

    total_num_nucleotides = 0
    total_num_contigs = 0
    total_num_nucleotides_removed = 0
    total_num_contigs_removed = 0

    while next(fasta):
        l = len(fasta.seq)

        total_num_nucleotides += l
        total_num_contigs += 1

        if input_ids and fasta.id.split()[0] in input_ids:
            total_num_nucleotides_removed += l
            total_num_contigs_removed += 1
            continue

        if len(fasta.seq) < args.min_len:
            total_num_nucleotides_removed += l
            total_num_contigs_removed += 1
            continue

        if args.simplify_names:
            if prefix:
                defline = '%s_%012d' % (prefix, fasta.pos)
            else:
                defline = 'c_%012d' % fasta.pos

            output.write_id(defline)
            output.write_seq(fasta.seq, split = False)

            if report_file:
                report_file.write('%s\t%s\n' % (defline, fasta.id))
        else:
            output.store(fasta, split = False)

    if report_file:
        report_file.close()

    fasta.close()
    output.close()

    run.info('Total num contigs', total_num_contigs)
    run.info('Total num nucleotides', total_num_nucleotides)
    run.info('Contigs removed', '%d (%.2f%% of all)' % (total_num_contigs_removed, total_num_contigs_removed * 100.0 / total_num_contigs), mc='green')
    run.info('Nucleotides removed', '%d (%.2f%% of all)' % (total_num_nucleotides_removed, total_num_nucleotides_removed * 100.0 / total_num_nucleotides), mc='green')
    run.info('Deflines simplified', args.simplify_names, mc='green')


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description="Reformat FASTA file (remove contigs based on length, or based on a given list of\
                                                  deflines, and/or generate an output with simpler names)")

    parser.add_argument('contigs_fasta')
    parser.add_argument('-l', '--min-len', type=int, default=0, metavar='MIN_LENGTH',
                        help="Minimum length of contigs to keep (contigs shorter than this value\
                              will not be included in the output file). The default is %(default)d,\
                              so nothing will be removed if you do not declare a minimum size.")
    parser.add_argument('-i', '--input-ids', required=False, metavar='TXT FILE',
                        help="IDs to remove from the FASTA file.")
    parser.add_argument('-o', '--output-file', required=True, metavar='FASTA FILE',
                        help="Output file path.")
    parser.add_argument('--simplify-names', default=False, action="store_true",
                        help="Edit deflines to make sure they contigs have simple names.")
    parser.add_argument('--prefix', default=None, metavar="PREFIX",
                        help="Use this parameter if you would like to add a prefix to your contig\
                              names while simplifying them. The prefix must be a single word (you\
                              can use underscor character, but nothing more!).")
    parser.add_argument('-r', '--report-file', required=False, metavar='REPORT FILE',
                        help="Report file path. When you run this program with `--simplify-names`\
                              flag, all changes to deflines will be reported in this file\
                              in case you need to go back to this information later. It is not\
                              mandatory to declare one, but it is a very good idea to have it.")


    args = parser.parse_args()

    try:
        remove_short_contigs_from_fasta(args)
    except ConfigError as e:
        print(e)
        sys.exit(-1)
    except FilesNPathsError as e:
        print(e)
        sys.exit(-2)
