#!/usr/bin/env python

import sys
import argparse
import re
from glob import glob

from hocr.parse import hocr_page_iterator, hocr_page_get_scan_res, \
        X_WCONF_REGEX, X_FSIZE_REGEX
from hocr.util import get_header_footer, elem_tostring, register_and_nuke_xhtml_namespace



# XXX: Fine tune these values
# Defaults for the command line options declared at the bottom of this file;
# they are handed to process_files() as its threshold arguments.

# A line is only inspected when the mean x_wconf of its words is below this.
AVERAGE_LINE_CONF_FILTER = 75
# Inside an inspected line, only words with an x_wconf below this are examined.
SINGLE_WORD_CONF_FILTER = 75
# A character whose x_conf is below this counts as "low confidence".
SINGLE_CHAR_CONF_THRESHOLD = 90
# A word is removed when the mean x_conf of its characters is below this.
AVERAGE_CHAR_CONF_THRESHOLD = 95
# A word is removed when 100 * product(x_conf / 100 for each character) is
# below this.
CHAR_CONFIDENCE_MULTIPLIER_THRESHOLD = 60
# A word is removed when the fraction (0.0 - 1.0) of its characters that are
# "low confidence" exceeds this.
LOW_CONF_CHARS_FOR_WORD_PERCENTAGE_THRESHOLD = 0.2
# Font size above which a word is considered unrealistically big; None (the
# default) disables the font size checks.
BIG_WORD_FONT_SIZE = None

# Matches the "x_conf" property inside a title attribute and captures the run
# of digits, dots, dashes and commas following it.
# NOTE(review): the trailing {1} quantifier is redundant.
X_CONF_REGEX = re.compile(r'x_conf\s+([\d\.\-,]+){1}')


def _get_fsize(wordelem):
    """Extract the font size recorded in a word element's title attribute.

    The title is searched with X_FSIZE_REGEX; the first capture group is
    converted to a float.

    Returns the font size as a float, or None when the title carries no
    match for X_FSIZE_REGEX.
    """
    match = X_FSIZE_REGEX.search(wordelem.attrib['title'])
    return float(match.group(1)) if match else None


# Extra removal criteria that only apply to words flagged as unrealistically
# big: such a word is dropped when its word confidence is below
# _BIG_WORD_MIN_WCONF or when it has fewer than _BIG_WORD_MIN_CHARS characters.
_BIG_WORD_MIN_WCONF = 85
_BIG_WORD_MIN_CHARS = 3

_XHTML_XMLNS_ATTR = ' xmlns="http://www.w3.org/1999/xhtml"'


def _word_confidence(word):
    """Return the x_wconf value parsed from a word element's title as float."""
    return float(X_WCONF_REGEX.search(word.attrib['title']).group(1).split()[0])


def _word_is_big(word, page_scaler, filter_big_text_fontsize):
    """Tell whether a word's scaled font size exceeds the big-text limit.

    Always False when the check is disabled (no limit or no page scaler) or
    when the word carries no font size at all.
    """
    if not (filter_big_text_fontsize and page_scaler):
        return False

    fsize = _get_fsize(word)
    if fsize is None:
        # No font size recorded for this word: nothing to compare against.
        return False

    return fsize * page_scaler > filter_big_text_fontsize


def _word_should_be_removed(word, wconf, word_big, single_char_confidence,
                            average_char_conf_thres,
                            char_conf_multiplier_thres,
                            low_char_conf_perc_thres):
    """Decide from per-character confidences whether a word is to be dropped.

    wconf is the word confidence, word_big tells whether the word was flagged
    as unrealistically big; the remaining arguments are the thresholds of
    process_files() with the same names.
    """
    char_confs = [
        float(X_CONF_REGEX.search(char.attrib['title']).group(1).split()[0])
        for char in word.findall('.//*[@class="ocrx_cinfo"]')
    ]

    if not char_confs:
        # Without character level data neither the character based criteria
        # nor the character count can be evaluated (and the averages below
        # would divide by zero); only the word level criterion is left.
        return word_big and wconf < _BIG_WORD_MIN_WCONF

    low_conf_count = sum(1 for conf in char_confs
                         if conf < single_char_confidence)

    # Multiplied confidence of all characters, scaled to 0 - 100
    multiplied_conf = 100.
    for conf in char_confs:
        multiplied_conf = multiplied_conf * (conf / 100)

    low_conf_perc = low_conf_count / len(char_confs)
    avg_conf = sum(char_confs) / len(char_confs)

    return (low_conf_perc > low_char_conf_perc_thres) \
        or (avg_conf < average_char_conf_thres) \
        or (multiplied_conf < char_conf_multiplier_thres) \
        or (word_big and wconf < _BIG_WORD_MIN_WCONF) \
        or (word_big and len(char_confs) < _BIG_WORD_MIN_CHARS)


def _filter_line(line, page_scaler, avg_line_thres, single_word_thres,
                 single_char_confidence, average_char_conf_thres,
                 char_conf_multiplier_thres, low_char_conf_perc_thres,
                 filter_big_text_fontsize):
    """Remove suspicious words from a line, and possibly the line itself.

    Returns True when the line element was removed from its parent.
    """
    words = line.findall("*[@class='ocrx_word']")
    if not words:
        # Nothing to judge (and the average below would divide by zero)
        return False

    wconfs = [_word_confidence(word) for word in words]
    big_flags = [_word_is_big(word, page_scaler, filter_big_text_fontsize)
                 for word in words]

    # A single big word anywhere in the line is enough to inspect the line
    one_word_big = any(big_flags)

    if not (one_word_big or (sum(wconfs) / len(wconfs) < avg_line_thres)):
        return False

    words_removed = 0
    for word, wconf, word_big in zip(words, wconfs, big_flags):
        if not (one_word_big or (wconf < single_word_thres)):
            continue

        if _word_should_be_removed(word, wconf, word_big,
                                   single_char_confidence,
                                   average_char_conf_thres,
                                   char_conf_multiplier_thres,
                                   low_char_conf_perc_thres):
            word.getparent().remove(word)
            words_removed += 1

    if not words_removed:
        return False

    remaining = line.findall("*[@class='ocrx_word']")
    percremoved = words_removed / len(words)

    # TODO: turn these into parameters too?
    if (len(words) >= 5 and percremoved > 0.5) or len(remaining) == 0:
        line.getparent().remove(line)
        return True

    return False


def process_files(files_to_process, avg_line_thres, single_word_thres,
                  single_char_confidence,
                  average_char_conf_thres, char_conf_multiplier_thres,
                  low_char_conf_perc_thres,
                  filter_big_text_fontsize):
    """Filter low confidence words out of hOCR files and write the result
    to stdout.

    The header and footer of the first file are written around the filtered
    pages of all files. Words are removed based on their word and character
    confidences; lines, paragraphs and blocks that end up empty (or lines
    that lost more than half of at least five words) are removed as well.

    Args:
        files_to_process: non-empty list of hOCR file paths, in output order.
        avg_line_thres: a line is inspected when the average confidence of
            its words is below this value.
        single_word_thres: in an inspected line, words with a confidence
            below this value are examined character by character.
        single_char_confidence: characters with a confidence below this value
            count as low confidence.
        average_char_conf_thres: a word is removed when the average
            confidence of its characters is below this value.
        char_conf_multiplier_thres: a word is removed when the multiplied
            confidence of its characters (scaled to 0 - 100) is below this
            value.
        low_char_conf_perc_thres: a word is removed when the fraction of its
            low confidence characters exceeds this value.
        filter_big_text_fontsize: font size above which a word is treated as
            unrealistically big, or None/0 to disable the font size checks.
    """
    top, bottom = get_header_footer(files_to_process[0])

    sys.stdout.buffer.write(top)

    page_no = 0

    for f in files_to_process:
        for page in hocr_page_iterator(f):
            page.tag = 'div'

            hocr_page_dpi = hocr_page_get_scan_res(page)[1]
            if hocr_page_dpi is not None:
                page_scaler = 1.
            else:
                # Disable the font checks completely if we don't have a known
                # dpi, as this metric is very unreliable in this case
                # If we can get the dpi another way, set it to 72 / dpi, since
                # we want to *decrease* the font size if Tesseract assumed 72
                # when in reality the dpi was higher
                page_scaler = None

            for block in page.findall("*[@class='ocr_carea']"):
                paragraphs_removed = False

                for par in block.findall("*[@class='ocr_par']"):
                    lines_removed = False

                    # Iterate over a copy, lines may get removed on the way
                    for line in list(par):
                        if _filter_line(line, page_scaler, avg_line_thres,
                                        single_word_thres,
                                        single_char_confidence,
                                        average_char_conf_thres,
                                        char_conf_multiplier_thres,
                                        low_char_conf_perc_thres,
                                        filter_big_text_fontsize):
                            lines_removed = True

                    if lines_removed and len(par) == 0:
                        par.getparent().remove(par)
                        paragraphs_removed = True

                if paragraphs_removed \
                        and len(block.findall("*[@class='ocr_par']")) == 0:
                    block.getparent().remove(block)

            page_no += 1

            s = elem_tostring(page).decode('utf-8')
            # Let's Remove the xmlns in the div.
            # Yes, this is horrible, but cleanup_namespaces doesn't help
            # since as far as tostring knows, this is the root.
            # Let's also add two spaces for indentation for the first
            # page, and one for all the other pages.
            indent = '  ' if page_no == 1 else ' '
            s = indent + s.replace(_XHTML_XMLNS_ATTR, '')

            sys.stdout.buffer.write(s.encode('utf-8'))

    sys.stdout.buffer.write(bottom)

# Command line entry point: parse the filter thresholds, expand the file glob
# and stream the filtered hOCR document to stdout.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Filter hOCR files by word confidence')
    # The glob is mandatory: without it glob(None) would fail with a TypeError
    # instead of a usage message.
    parser.add_argument('-g', '--glob',
                        help='Glob of files to parse',
                        type=str, required=True)
    parser.add_argument('--line-confidence',
                        help='Average line confidence to initiate filtering. '
                        '0 implies no filtering. '
                        'Default is %d' % AVERAGE_LINE_CONF_FILTER,
                        type=int, default=AVERAGE_LINE_CONF_FILTER)
    parser.add_argument('--single-word-confidence',
                        help='Single word confidence threshold to warrant '
                        'potential removal. 0 implies no filtering. '
                        'Default is %d' % SINGLE_WORD_CONF_FILTER,
                        type=int, default=SINGLE_WORD_CONF_FILTER)
    parser.add_argument('--low-conf-chars-percentage',
                        help='Percentage of low confidence characters in a '
                        'word to warrant potential removal, between '
                        '0.0 and 1.0. Default is %f' % LOW_CONF_CHARS_FOR_WORD_PERCENTAGE_THRESHOLD,
                        type=float, default=LOW_CONF_CHARS_FOR_WORD_PERCENTAGE_THRESHOLD)
    parser.add_argument('--single-char-confidence',
                        help='Single character confidence threshold for a '
                        'character to be marked as low confidence with '
                        '--low-conf-chars-percentage and '
                        '--char-conf-multiplier-threshold ; '
                        '0 means nothing is suspicious. '
                        'Default is %d' % SINGLE_CHAR_CONF_THRESHOLD,
                        type=int, default=SINGLE_CHAR_CONF_THRESHOLD)
    parser.add_argument('--char-conf-multiplier-threshold',
                        help='Multiplied character confidence threshold '
                        '(per word), 0 implies that this is to be ignored. '
                        'Default is %d' % CHAR_CONFIDENCE_MULTIPLIER_THRESHOLD,
                        type=int, default=CHAR_CONFIDENCE_MULTIPLIER_THRESHOLD)
    parser.add_argument('--average-character-confidence-threshold',
                        help='Average character confidence threshold to '
                        'warrant removal of a word. 0 implies that this is '
                        'to be ignored. Default is %d' % AVERAGE_CHAR_CONF_THRESHOLD,
                        type=int, default=AVERAGE_CHAR_CONF_THRESHOLD)
    parser.add_argument('--filter-big-text-fontsize',
                        help='Filter of really big characters/words. '
                        'This should be used only on the second pass meant to '
                        'find stray text, as this step can filter out noise found '
                        'in some of those steps. Default is no filtering, '
                        '30 could be sensible value. If the hOCR page has no '
                        'scan_res/dpi, this check is not active',
                        type=int, default=BIG_WORD_FONT_SIZE)

    args = parser.parse_args()

    # Sorted so that the pages are emitted in a stable, name based order
    files_to_process = sorted(glob(args.glob))

    if not files_to_process:
        print('No files to process!', file=sys.stderr)
        sys.exit(1)

    register_and_nuke_xhtml_namespace()
    process_files(files_to_process, args.line_confidence,
                  args.single_word_confidence, args.single_char_confidence,
                  args.average_character_confidence_threshold,
                  args.char_conf_multiplier_threshold,
                  args.low_conf_chars_percentage,
                  args.filter_big_text_fontsize)

