#!/usr/bin/env python

import sys
import argparse
import re
from glob import glob

from lxml import etree

from hocr.parse import hocr_page_iterator, X_WCONF_REGEX
from hocr.util import get_header_footer



# XXX: Fine tune these values
# Defaults for the command-line options declared in the __main__ section;
# each one is passed through to process_files() unless overridden.

# A line is only examined word by word when the average confidence of its
# words is below this value (--line-confidence).
AVERAGE_LINE_CONF_FILTER = 75
# Within such a line, only words whose own confidence is below this value
# are candidates for removal (--single-word-confidence).
SINGLE_WORD_CONF_FILTER = 75
# A character whose confidence is below this value is counted as
# "low confidence" (--single-char-confidence).
SINGLE_CHAR_CONF_THRESHOLD = 90
# A candidate word is removed when the mean confidence of its characters is
# below this value (--average-character-confidence-threshold).
AVERAGE_CHAR_CONF_THRESHOLD = 95
# A candidate word is removed when 100 multiplied by every (conf / 100) of
# its characters is below this value (--char-conf-multiplier-threshold).
CHAR_CONFIDENCE_MULTIPLIER_THRESHOLD = 60
# A candidate word is removed when the fraction (0.0 - 1.0) of its characters
# counted as low confidence exceeds this value (--low-conf-chars-percentage).
LOW_CONF_CHARS_FOR_WORD_PERCENTAGE_THRESHOLD = 0.2

# Captures the value that follows "x_conf" in the title attribute of an
# ocrx_cinfo (per-character) element.
X_CONF_REGEX = re.compile(r'x_conf\s+([\d\.\-,]+){1}')

def _word_confidence(word):
    """Return the confidence parsed from an ocrx_word element's title."""
    return float(X_WCONF_REGEX.search(word.attrib['title']).group(1).split()[0])


def _word_is_unreliable(word, single_char_confidence, average_char_conf_thres,
                        char_conf_multiplier_thres, low_char_conf_perc_thres):
    """Return True when the per-character confidences of *word* fail any
    of the three character-level criteria.

    A word without any ocrx_cinfo elements cannot be judged and is reported
    as reliable (False), so it is kept rather than dividing by zero.
    """
    char_confs = [
        float(X_CONF_REGEX.search(char.attrib['title']).group(1).split()[0])
        for char in word.xpath('.//*[@class="ocrx_cinfo"]')
    ]
    if not char_confs:
        return False

    low_count = sum(1 for conf in char_confs if conf < single_char_confidence)
    low_perc = low_count / len(char_confs)
    avg_conf = sum(char_confs) / len(char_confs)

    # Product of all character confidences, expressed on a 0-100 scale.
    multiplied = 100.
    for conf in char_confs:
        multiplied = multiplied * (conf / 100)

    return (low_perc > low_char_conf_perc_thres
            or avg_conf < average_char_conf_thres
            or multiplied < char_conf_multiplier_thres)


def _filter_line(line, avg_line_thres, single_word_thres,
                 single_char_confidence, average_char_conf_thres,
                 char_conf_multiplier_thres, low_char_conf_perc_thres):
    """Remove unreliable words from *line*; return True if the line itself
    was removed from its parent as a result.
    """
    words = line.xpath("*[@class='ocrx_word']")
    if not words:
        # Nothing to average over (empty line or a non-line child): keep it.
        return False

    # Parse each word confidence once and reuse it below.
    confs = [_word_confidence(word) for word in words]

    words_removed = 0
    if sum(confs) / len(confs) < avg_line_thres:
        for word, wconf in zip(words, confs):
            if wconf < single_word_thres and _word_is_unreliable(
                    word, single_char_confidence, average_char_conf_thres,
                    char_conf_multiplier_thres, low_char_conf_perc_thres):
                word.getparent().remove(word)
                words_removed += 1

    if not words_removed:
        return False

    remaining = line.xpath("*[@class='ocrx_word']")
    perc_removed = words_removed / len(words)

    # TODO: turn these into parameters too?
    if (len(words) >= 5 and perc_removed > 0.5) or not remaining:
        line.getparent().remove(line)
        return True

    return False


def process_files(files_to_process, avg_line_thres, single_word_thres,
                  single_char_confidence,
                  average_char_conf_thres, char_conf_multiplier_thres,
                  low_char_conf_perc_thres):
    """Filter low-confidence words out of hOCR files and write the result
    to standard output.

    The header and footer obtained from the first file are written around
    the filtered pages of all files, in the order given.

    Args:
        files_to_process: Non-empty sequence of hOCR file paths.
        avg_line_thres: A line is inspected only when the average confidence
            of its words is below this value.
        single_word_thres: In an inspected line, only words with a confidence
            below this value are candidates for removal.
        single_char_confidence: Characters below this confidence count as
            low confidence.
        average_char_conf_thres: A candidate word is removed when its mean
            character confidence is below this value.
        char_conf_multiplier_thres: A candidate word is removed when the
            product of its character confidences (0-100 scale) is below
            this value.
        low_char_conf_perc_thres: A candidate word is removed when the
            fraction of its low-confidence characters exceeds this value.

    A line is dropped when it had at least five words and more than half
    were removed, or when no words remain. Paragraphs left without children
    and blocks left without paragraphs are dropped as well. Lines without
    words and words without character information are left untouched.

    Raises:
        IndexError: If files_to_process is empty.
    """
    top, bottom = get_header_footer(files_to_process[0])

    sys.stdout.buffer.write(top)

    page_no = 0

    for f in files_to_process:
        for page in hocr_page_iterator(f):
            page.tag = 'div'

            for block in page.xpath("*[@class='ocr_carea']"):
                paragraphs_removed = False

                for par in block.xpath("*[@class='ocr_par']"):
                    lines_removed = False

                    # Iterate over a snapshot, since lines may be removed.
                    for line in list(par):
                        if _filter_line(line, avg_line_thres,
                                        single_word_thres,
                                        single_char_confidence,
                                        average_char_conf_thres,
                                        char_conf_multiplier_thres,
                                        low_char_conf_perc_thres):
                            lines_removed = True

                    if lines_removed and len(par) == 0:
                        par.getparent().remove(par)
                        paragraphs_removed = True

                if paragraphs_removed \
                        and not block.xpath("*[@class='ocr_par']"):
                    block.getparent().remove(block)

            page_no += 1

            s = etree.tostring(page, pretty_print=True, method='xml',
                               encoding='utf-8').decode('utf-8')
            # Let's Remove the xmlns in the div.
            # Yes, this is horrible, but cleanup_namespaces doesn't help
            # since as far as tostring knows, this is the root.
            # Let's also add two spaces for indentation for the first
            # page, and one for all the other pages.
            indent = '  ' if page_no == 1 else ' '
            s = indent + s.replace(' xmlns="http://www.w3.org/1999/xhtml"', '')

            sys.stdout.buffer.write(s.encode('utf-8'))

    sys.stdout.buffer.write(bottom)

# Command-line entry point: parse the thresholds, expand the glob and write
# the filtered hOCR document to standard output.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Filter hOCR files by word confidence')
    # The glob is mandatory: without it glob() would be called with None and
    # fail with a TypeError instead of a usage message.
    parser.add_argument('-g', '--glob',
                        help='Glob of files to parse',
                        type=str, required=True)
    parser.add_argument('--line-confidence',
                        help='Average line confidence to initiate filtering '
                        '0 implies no filtering. '
                        'Default is %d' % AVERAGE_LINE_CONF_FILTER,
                        type=int, default=AVERAGE_LINE_CONF_FILTER)
    parser.add_argument('--single-word-confidence',
                        help='Single word confidence threshold to warrant '
                        'potential removal. 0 implies no filtering. '
                        'Default is %d' % SINGLE_WORD_CONF_FILTER,
                        type=int, default=SINGLE_WORD_CONF_FILTER)
    parser.add_argument('--low-conf-chars-percentage',
                        help='Percentage of low confidence characters in a '
                        'word to warrant potential removal, between '
                        '0.0 and 1.0. Default is %f' % LOW_CONF_CHARS_FOR_WORD_PERCENTAGE_THRESHOLD,
                        type=float, default=LOW_CONF_CHARS_FOR_WORD_PERCENTAGE_THRESHOLD)
    parser.add_argument('--single-char-confidence',
                        help='Single chararacter confidence threshold for a '
                        'character to be marked as low confidence with '
                        '--low-conf-chars-percentage and '
                        '--char-conf-multiplier-threshold ; '
                        '0 means nothing is suspicious. '
                        'Default is %d' % SINGLE_CHAR_CONF_THRESHOLD,
                        type=int, default=SINGLE_CHAR_CONF_THRESHOLD)
    parser.add_argument('--char-conf-multiplier-threshold',
                        help='Multiplied chararacter confidence threshold '
                        '(per word), 0 implies that this is to be ignored. '
                        'Default is %d' % CHAR_CONFIDENCE_MULTIPLIER_THRESHOLD,
                        type=int, default=CHAR_CONFIDENCE_MULTIPLIER_THRESHOLD)
    parser.add_argument('--average-character-confidence-threshold',
                        help='Average character confidence threshold to '
                        'warrant removal of a word. 0 implies that this is '
                        'to be ignored. Default is %d' % AVERAGE_CHAR_CONF_THRESHOLD,
                        type=int, default=AVERAGE_CHAR_CONF_THRESHOLD)

    args = parser.parse_args()

    # Sort the matches so files are processed in a deterministic order.
    files_to_process = sorted(glob(args.glob))

    if not files_to_process:
        print('No files to process!', file=sys.stderr)
        sys.exit(1)

    process_files(files_to_process, args.line_confidence,
                  args.single_word_confidence, args.single_char_confidence,
                  args.average_character_confidence_threshold,
                  args.char_conf_multiplier_threshold,
                  args.low_conf_chars_percentage)

