#!/usr/bin/env python

import argparse

from xml.etree import ElementTree

from hocr.parse import hocr_page_iterator
from hocr.util import open_if_required, get_header_footer, \
        register_and_nuke_xhtml_namespace, elem_tostring

def process_file(filepath, outfmt):
    """Write each page of an hOCR document to its own output file.

    Every output file consists of the document header, the serialized page
    element, and the document footer, in that order.

    Args:
        filepath: Input passed straight to ``open_if_required``.
        outfmt: ``%``-style format string that takes a single int (the
            zero-based page index from ``enumerate``) and yields the output
            path for that page, e.g. ``'page_%04d.html'``.

    Raises:
        TypeError: If ``outfmt`` does not support ``outfmt % int``.
        OSError: If an output file cannot be opened or written.
    """
    # NOTE(review): fd is never closed here; ownership depends on what
    # open_if_required returns for an already-open input - confirm.
    fd = open_if_required(filepath)
    # Header/footer are written to a binary file below, so they must be
    # bytes-like.
    top, bottom = get_header_footer(fd)

    for pageno, page in enumerate(hocr_page_iterator(fd)):
        # Serialize before opening the output so a failure here does not
        # leave an empty file behind.
        s = elem_tostring(page).decode('utf-8')

        # Drop the XHTML namespace declaration from the page markup.
        # The leading space is presumably indentation inside the header's
        # enclosing element - TODO confirm.
        s = ' ' + s.replace(' xmlns="http://www.w3.org/1999/xhtml"', '')

        # The context manager closes the file even if a write raises.
        with open(outfmt % pageno, 'wb') as fp:
            fp.write(top)
            fp.write(s.encode('utf-8'))
            fp.write(bottom)


if __name__ == '__main__':
    # Command-line entry point: parse the options, prepare the XHTML
    # namespace handling, then split the input into per-page files.
    parser = argparse.ArgumentParser(description='hOCR single page extractor')
    parser.add_argument('-f', '--infile', help='Filename to read',
                        type=str, default=None)
    # Required: process_file evaluates ``out_format % pageno``, which raises
    # TypeError on the first page when the option is left at None. Let
    # argparse report the missing option as a usage error instead.
    parser.add_argument('-o', '--out-format', help='Outfile format - make sure it '
                        'takes an int as format',
                        type=str, required=True)
    args = parser.parse_args()

    # Called before any page is parsed or serialized.
    register_and_nuke_xhtml_namespace()
    process_file(args.infile, args.out_format)
