#!/usr/bin/env python

import sys
import argparse

import json

from hocr.searching import hocr_load_lookup_table
from hocr.util import open_if_required
from hocr.fts import find_matches


def process_file(hocrfile, textfile, tablepath, es_workaround, pre_tag, post_tag):
    """Write the matches found in an hOCR file to stdout, one JSON document per line.

    The lookup table is loaded from ``tablepath``, the annotated text is read
    in full and decoded as UTF-8, and every result produced by
    ``find_matches`` is serialised with ``json.dump`` followed by a newline.

    Args:
        hocrfile: hOCR input, passed to ``open_if_required``.
        textfile: Annotated fulltext input, passed to ``open_if_required``;
            its ``read()`` result must be bytes, since it is decoded here.
        tablepath: Location of the lookup table, passed to
            ``hocr_load_lookup_table``.
        es_workaround: Forwarded to ``find_matches`` as
            ``es_whitespace_fixup_required``.
        pre_tag: Forwarded to ``find_matches`` as ``pre_tag``.
        post_tag: Forwarded to ``find_matches`` as ``post_tag``.
    """
    lookup_table = hocr_load_lookup_table(tablepath)

    hocrfp = open_if_required(hocrfile)
    try:
        textfp = open_if_required(textfile)
        try:
            text = textfp.read().decode('utf-8')
        finally:
            # Only close handles created on our behalf; an object handed in
            # by the caller (returned unchanged) remains the caller's to close.
            if textfp is not textfile:
                textfp.close()

        # The hOCR handle must stay open while results are being consumed,
        # so it is only released once the loop has finished (or failed).
        for word_results in find_matches(lookup_table, hocrfp,
                                         text, es_whitespace_fixup_required=es_workaround,
                                         pre_tag=pre_tag, post_tag=post_tag):
            json.dump(word_results, sys.stdout)
            sys.stdout.write('\n')
    finally:
        if hocrfp is not hocrfile:
            hocrfp.close()


if __name__ == '__main__':
    # Command-line entry point: parse the options and hand them to
    # process_file, which prints its results to stdout.
    parser = argparse.ArgumentParser(description='hOCR lookup table '
                                                 'paragraph-level validator')
    parser.add_argument('--hocr', help='hOCR Filename to read',
                        type=str, default=None)
    parser.add_argument('--annotated-text', help='Annotated fulltext filename '
                                                 'to read', type=str,
                                                 default=None)
    parser.add_argument('--table', help='Table to use',
                        type=str, default=None)
    parser.add_argument('--pre-tag', help='Highlighting pretags, normally {{{', type=str, default='{{{')
    parser.add_argument('--post-tag', help='Highlighting posttags, normally }}}', type=str, default='}}}')
    # The two help fragments are joined by implicit string concatenation, so
    # the first one needs a trailing space to keep the words apart.
    parser.add_argument('--es-workaround', help='Flag to enable working around '
                        'ES stripping leading whitespace',
                        default=False, action='store_true')
    args = parser.parse_args()

    process_file(args.hocr, args.annotated_text, args.table, args.es_workaround,
                 args.pre_tag, args.post_tag)
