#!/usr/bin/env python

import sys
import re

import argparse

from hocr.parse import hocr_page_iterator
from hocr.util import open_if_required, get_header_footer, elem_tostring, register_and_nuke_xhtml_namespace

# Matches the opening tag of an hOCR page container, i.e.
# <div class="ocr_page" ...>, with either quote style around the class value.
# The tail is [^>]*> rather than .*> so the match stops at the end of that
# one tag instead of running to the last '>' on the line, which would also
# swallow any markup following the tag on the same line.
rem = re.compile("<div class=['\"]ocr_page['\"][^>]*>")

def process_file(filepath):
    """Print an hOCR document with all of its pages merged into one page.

    The document header is printed first, then the content of every page,
    then the document footer, all to standard output.  The opening
    ``<div class="ocr_page" ...>`` tag of the first page is kept and serves
    as the container for the content of all following pages, whose own
    opening tags are removed.  A single closing ``</div>`` is written after
    the last page, so the container is closed exactly once no matter how
    many pages the document has.

    Args:
        filepath: Value handed to ``open_if_required`` to obtain the input
            (presumably a path or an already open file object -- confirm
            against ``hocr.util``).
    """
    fd = open_if_required(filepath)
    top, bottom = get_header_footer(fd)

    # Becomes True once a page's opening tag has been printed without its
    # closing tag, i.e. once the merged container is open.
    container_open = False

    print(top.decode('utf-8'))
    for page in hocr_page_iterator(fd):
        s = elem_tostring(page).decode('utf-8')

        s = ' ' + s.replace(' xmlns="http://www.w3.org/1999/xhtml"', '')

        if container_open:
            # Drop this page's own opening tag so that its content lands
            # inside the container opened by an earlier page.
            # NOTE: this relies on `rem` matching only the opening tag; a
            # greedy pattern would also remove markup that shares its line.
            s, strip_closing = rem.subn('', s, count=1)
        else:
            # This page provides the container: keep its opening tag.
            strip_closing = True

        # Every merged page loses its closing </div>; one is re-added after
        # the loop.  Stripping it from the first page only (while later pages
        # keep theirs) is balanced for two-page documents and nothing else.
        # A page whose opening tag was not recognised is left intact so it
        # stays well formed, and a page without any </div> (for example a
        # self-closing element) is printed unchanged.
        idx = s.rfind('</div>') if strip_closing else -1
        if idx != -1:
            s = s[:idx]
            container_open = True

        print(s)

    if container_open:
        # Close the merged page container exactly once.
        print(' </div>')

    print(bottom.decode('utf-8'))


if __name__ == '__main__':
    # Command-line entry point: read the input filename from -f/--infile and
    # write the flattened document to standard output.
    cli = argparse.ArgumentParser(
        description='hOCR page flatten (combine several pages into one page)'
    )
    cli.add_argument(
        '-f', '--infile',
        type=str,
        default=None,
        help='Filename to read',
    )
    options = cli.parse_args()

    # Namespace setup happens before any page is serialised (presumably so
    # that the xhtml prefix is not emitted -- confirm against hocr.util).
    register_and_nuke_xhtml_namespace()
    process_file(options.infile)
