#!/usr/bin/env python

# Created on Mon Nov 03 19:32:33 2014

# Author: XiaoTao Wang
# Organization: HuaZhong Agricultural University

## Required modules
from __future__ import division
import argparse, sys, logging, logging.handlers

def getargs():
    ## Construct an ArgumentParser object for command-line arguments
    parser = argparse.ArgumentParser(usage = '%(prog)s [options]\n\n'
                                     'CALFEA -- CALculate FEAture for TADs',
                                    description = '''The calculation is implemented by identifying
                                    long-range significant interactions for each TAD and looking for
                                    the aggregation patterns thereof. For more details, please
                                    refer to online TADLib documentation.''',
                                    formatter_class = argparse.ArgumentDefaultsHelpFormatter)

    # Version
    parser.add_argument('-v', '--version', action = 'version', version = '%(prog)s 0.2.4',
                        help = 'Print version number and exit')

    # Output
    parser.add_argument('-f', '--filename', type = argparse.FileType('w'),
                        default = sys.stdout, help = 'Output file name.')
    
    # About Logging
    parser.add_argument('--verbose', type = int, default = 2, choices = [0, 1, 2, 3],
                        help = 'Set logging level. 0: only show error messages, 1: also report '
                               'warnings, 2: show process information, 3: debug. '
                               'Note: We log all messages of all levels to a disk file'
                               '(calfea.log), while simultaneously logging process '
                               'information or above to the console.')
    
    ## Argument Groups
    ## First one, about Hi-C data
    group_1 = parser.add_argument_group(title = 'Relate to Hi-C data:')
    group_1.add_argument('-p', '--path', default = '.',
                         help = 'Path to the folder with Hi-C data. Support both absolute'
                                ' and relative path.')
    group_1.add_argument('-F', '--Format', default = 'TXT', choices = ['TXT', 'NPZ'],
                         help = 'Format of source data file')
    group_1.add_argument('-R', '--resolution', default = 10000, type = int,
                         help = 'Resolution of the binned data')
    group_1.add_argument('-T', '--template', default = 'chr%s_chr%s.int',
                         help = 'Template for matching file names using regular expression.'
                         ' Needed when "--Format" is set to be "TXT". Note only within-chromosome'
                         ' data are allowed, and don\'t place inter-chromosome data under the '
                         'folder.')
    group_1.add_argument('-C', '--chroms', nargs = '*', default = ['#', 'X'],
                         help = 'Which chromosomes to read. Specially, "#" stands'
                         ' for chromosomes with numerical labels. "--chroms" with zero argument'
                         ' will generate an empty list, in which case all chromosome data will'
                         ' be loaded.')
    group_1.add_argument('-c', '--cols', nargs = 3, type = int,
                         help = 'Which 3 columns to read, with 0 being the first. For example,'
                         ' "--cols 1 3 4" will extract the 2nd, 4th and 5th columns. Only '
                         'required when "--Format=TXT".')
    group_1.add_argument('--immortal', action = 'store_true',
                         help = 'When specified, a Numpy .npz file will be generated under the same '
                         'folder. This operation will greatly speed up data loading process next'
                         ' time.')
    group_1.add_argument('-P', '--prefix', help = 'Prefix of input .npz file name, path not'
                         ' included. Required when "--Format=NPZ".')
    group_1.add_argument('-S', '--saveto', help = 'Prefix of output .npz file name, path '
                         'not included. Required with "--immortal".')
                     
    ## Second one, load TAD data
    group_2 = parser.add_argument_group(title = 'TAD Loading:', description = 'Your file should '
                                        'contain 3 columns indicating "chromosome name",'
                                        ' "start site" and "end site", respectively.')
    group_2.add_argument('-s', '--source', help = 'Complete source file name. Both absolute '
                         'and relative path are supported.')
    group_2.add_argument('--chromname', help = 'Template of chromosome names')
    
    ## About the algorithm
    group_3 = parser.add_argument_group(title = 'Feature calculation')
    group_3.add_argument('--pw', type = int, default = 2, help = 'Width of the interaction '
                         'region surrounding the peak. According to experience, we set it'
                         ' to 2 at 10 kb and 4 at 5 kb.')
    group_3.add_argument('--ww', type = int, default = 5, help = 'The size of the donut '
                         'sampled. Set it to 5 at 10 kb, and 7 at 5 kb.')
    group_3.add_argument('--top', type = float, default = 0.7, help = 'Parameter for noisy '
                         'interaction filtering. By default, 30 percent noisy interactions'
                         ' will be eliminated.')
    group_3.add_argument('--ratio', type = float, default = 0.05, help = 'Specifies the sample'
                         ' ratio of significant interactions for TAD.')
    group_3.add_argument('--gap', type = float, default = 0.2, help = 'Maximum gap ratio.')
    
    ## Parse the command-line arguments
    commands = sys.argv[1:]
    if not commands:
        commands.append('-h')
    args = parser.parse_args(commands)
    
    args.Format = args.Format.upper()

    ## Conflicts
    if (args.Format == 'TXT') and (not args.cols):
        parser.print_help()
        parser.exit(1, 'You have to specify "--cols" with "--Format TXT"!')
    if (args.Format == 'NPZ') and (not args.prefix):
        parser.print_help()
        parser.exit(1, 'You have to specify "--prefix" with "--Format NPZ"!')
    if args.immortal and (not args.saveto):
        parser.print_help()
        parser.exit(1, 'You have to specify "--saveto" with "--immortal" flag!')
    
    return args, commands

## Pipeline
def pipe():
    """The Main pipeline for CALFEA.
    
    """
    # Parse Arguments
    args, commands = getargs()
    # Improve the performance if you don't want to run it
    if commands[0] not in ['-h', '-v', '--help', '--version']:
        ## Root Logger Configuration
        logger = logging.getLogger()
        # Logger Level
        logger.setLevel((4 - args.verbose) * 10)
        console = logging.StreamHandler()
        filehandler = logging.handlers.RotatingFileHandler('calfea.log',
                                                           maxBytes = 30000,
                                                           backupCount = 5)
        # Set level for Handlers
        console.setLevel('INFO')
        filehandler.setLevel('DEBUG')
        # Customizing Formatter
        formatter = logging.Formatter(fmt = '%(name)-14s %(levelname)-7s @ %(asctime)s: %(message)s',
                                      datefmt = '%m/%d/%y %H:%M:%S')
        ## Unified Formatter
        console.setFormatter(formatter)
        filehandler.setFormatter(formatter)
        # Add Handlers
        logger.addHandler(console)
        logger.addHandler(filehandler)
        ## Logging for argument setting
        arglist = ['# ARGUMENT LIST:',
                   '# output file name = %s' % args.filename,
                   '# data folder = %s' % args.path,
                   '# Hi-C format = %s' % args.Format,
                   '# chromosomes = %s' % args.chroms,
                   '# data resolution = %s' % args.resolution,
                   '# TAD source file = %s' % args.source,
                   '# chromosome name template = %s' % args.source,
                   '# Peak window width = %s' % args.pw,
                   '# Donut size = %s' % args.ww,
                   '# Noise filtering ratio = %s' % (1 - args.top),
                   '# Significant interaction ratio = %s' % args.ratio,
                   '# Maximum gap ratio = %s' % args.gap
                   ]
        argtxt = '\n'.join(arglist)
        logger.info('\n' + argtxt)
    
        # Interface for Inters and Queue Object
        dicts = {'path': args.path, 'Format': args.Format, 'resolution': args.resolution,
                 'template': args.template, 'chroms': args.chroms, 'cols': args.cols,
                 'prefix': args.prefix, 'immortal': args.immortal, 'saveto': args.saveto}
                 
        from tadlib import analyze
        
        logger.info('Read Hi-C data ...')
        # Inters Object, Load Hi-C Data
        workInters = analyze.Inters(**dicts)
        logger.info('Done!')
        # TAD Object, Load External TAD File, Columns 0,1,2
        logger.info('Read external TAD data ...')
        workTAD = analyze.TAD(source = args.source)
        logger.info('Done!')
        # Interaction Data
        Idata = workInters.data
        # Chromosome Labels
        chroms = workInters.labels
        # TAD Data
        Tdata = workTAD.data
    
        # Header
        H = ['ChromID', 'Start', 'End', 'AP', 'Gap-Ratio']
        args.filename.write('\t'.join(H) + '\n')
        logger.info('Calculate feature for each TAD ...')
        # Loop for Each Chromosome and Each TAD
        for c in chroms:
            logger.info('Chromosome %s', c)
            # Extract Data of One Chromosome
            i_data = Idata[c]
            d_data = Tdata[Tdata['chr'] == c]
            for d in d_data:
                ## Boundary of Current TAD
                left = d['start'] // args.resolution
                right = d['end'] // args.resolution
                # Interaction Matrix
                matrix = analyze.getmatrix(i_data, left, right)
                
                newM, convert = analyze.manipulation(matrix)
                if len(matrix) > 0:
                    # Ratio of Gaps (Vacant Rows or Columns)
                    gaps = 1 - len(newM) / len(matrix)
                else:
                    gaps = 1.0
                
                # Logical Condition
                if (gaps < args.gap) and (newM.shape[0] > (args.ww * 2 + 1)):
                    # Core Object for Interaction Analysis
                    workCore = analyze.Core(matrix)
                    # Extract Long-Range Interactions
                    workCore.longrange(pw = args.pw, ww = args.ww, top = args.top, ratio = args.ratio)
                    # Natural Clusters
                    workCore.DBSCAN()
                    # Feature
                    workCore.gdensity()
                    # Line by Line
                    wL = [c, str(d['start']), str(d['end']), str(workCore.gden), str(gaps)]
                else:
                    # Bad Domain!
                    wL = [c, str(d['start']), str(d['end']), '-1', str(gaps)]
                args.filename.write('\t'.join(wL) + '\n')
        logger.info('Done!')
        logger.info('Write results to %s ...', args.filename)
        args.filename.flush()
        args.filename.close()
        logger.info('Done!\n')

if __name__ == '__main__':
    pipe()
