#!/usr/bin/env python

import argparse
import os
import shutil
import subprocess
import tarfile
import zipfile
import html

from collections import OrderedDict
from typing import List, Optional

from ebooklib import epub
from PIL import Image

import hocr.parse
import hocr.util

try:
    from derivermodule.metadata import parse_item_metadata
    from derivermodule.scandata import (scandata_get_pagetype_pages,
                                        scandata_get_skip_pages,
                                        scandata_parse)
except ImportError:
    # derivermodule is optional: without it _meta.xml and _scandata.xml are
    # simply not supported. Every imported name gets a None placeholder so
    # that code further down can test for availability with `is not None`
    # instead of tripping over an undefined name.
    parse_item_metadata         = None
    scandata_parse              = None
    scandata_get_skip_pages     = None
    scandata_get_pagetype_pages = None


# Override Pillow's module-wide pixel-count limit so that very large page
# scans can still be opened.
# NOTE(review): presumably chosen to fit the largest scans seen in practice -
# confirm before changing.
Image.MAX_IMAGE_PIXELS = 933120000


# Fragments of kdu_expand error output that identify a page image as broken
# in a way that may be skipped instead of aborting the whole conversion.
KAKADU_ERRORS = [
    'Cannot write output with sample precision in excess of 32 bits per sample',
    'Unable to find the compositing layer identified',
]

# Names reported for the OCR engine detected in the hOCR file.
OCR_SYSTEM_TESSERACT = 'Tesseract'
OCR_SYSTEM_ABBYY = 'ABBYY'
OCR_SYSTEM_UNKNOWN = 'Unknown'

# Directory for temporary files: use /var/tmp/fast/ when it exists and fall
# back to /tmp/ otherwise.
WORKING_DIR = '/var/tmp/fast/' if os.path.exists('/var/tmp/fast') else '/tmp/'


def image_has_non_fatal_kakadu_errors(
        kakadu_error: str, known_kakadu_errors: List[str]
    ) -> bool:
    """
    Tell whether a Kakadu error message matches one of the known errors that
    should not abort the overall conversion.

    Some `.jp2` files are corrupt in ways Kakadu cannot handle (they do not
    display in BookReader either, see e.g.
    https://archive.org/details/cu31924003577214/page/n11/mode/2up), but the
    remaining pages can still be salvaged.

    Returns True as soon as a non-empty entry of `known_kakadu_errors` occurs
    as a substring of `kakadu_error`, and False if none does.
    """
    for known_error in known_kakadu_errors:
        # An empty pattern is contained in every string; it is deliberately
        # not counted as a match.
        if known_error in kakadu_error and known_error:
            return True
    return False


class ImageStack(object):
    """
    Page images of an item, read from a zip or tar archive.

    The archive members are listed once at construction time, sorted by
    name (the name order should correspond to the page order). Individual
    pages are extracted, converted to TIFF and cropped on demand by
    `crop_image`. Every temporary file created that way is recorded in
    `temp_files` and removed again in `__del__`.
    """

    def __init__(self,
                 image_archive_file_path: str,
                 output_basename: str,
                 use_kakadu: bool = False,
                 ignore_broken_images: bool = False):
        """
        Args:
            image_archive_file_path: Path of the image archive. A path ending
                in '.zip' is opened as a zip file, anything else with
                `tarfile`.
            output_basename: Path prefix for the cropped JPEG files.
            use_kakadu: Decode '.jp2' members with the external `kdu_expand`
                program instead of Pillow.
            ignore_broken_images: Skip (rather than fail on) images for which
                `kdu_expand` reports one of the known KAKADU_ERRORS.
        """
        # Per-instance state. These must not be class attributes: mutable
        # class attributes are shared, so a second ImageStack would inherit
        # the file list and temp files of the first one.
        self.filenames = []         # archive member names, sorted
        self.images_per_page = {}   # page index -> index of last crop made
        self.temp_files = []        # files to delete in __del__

        self.output_basename = output_basename
        self.image_archive_file_path = image_archive_file_path
        self.is_zip = self.image_archive_file_path.endswith('.zip')
        self.tempdir_zip   = os.path.join(WORKING_DIR, 'kakadu_input')
        self.tempfile_jp2  = os.path.join(WORKING_DIR, 'temp.jp2')
        self.use_kakadu = use_kakadu
        self.ignore_broken_images = ignore_broken_images
        self.parse_stack()

    def parse_stack(self):
        """
        Open the archive and store the sorted names of all regular file
        members in `self.filenames`. The archive stays open in `self.zf`.
        """
        if self.is_zip:
            # Get all the images in the filename order (this should correspond to the page ordering)
            self.zf = zipfile.ZipFile(self.image_archive_file_path)
            self.filenames = [
                name for name in sorted(self.zf.namelist())
                if not self.zf.getinfo(name).is_dir()
            ]
        else:
            self.zf = tarfile.open(self.image_archive_file_path)
            self.filenames = sorted(
                member.name for member in self.zf if member.isfile()
            )

    def crop_image(self, page, box):
        """
        Write a JPEG of page number `page`, cropped to `box`.

        Args:
            page: Index into `self.filenames`.
            box: Crop box handed to `Image.crop`; a falsy value (e.g. None)
                saves the whole page.

        Returns:
            The path of the JPEG that was written, named
            "<output_basename>_<page>_<n>.jpeg" where n counts the crops made
            from this page. Returns None when `ignore_broken_images` is set
            and `kdu_expand` reported a known non-fatal error.

        Raises:
            RuntimeError: If `kdu_expand` fails for any other reason.
        """
        # Keep track of the number of images cropped out from each page
        try:
            self.images_per_page[page] += 1
        except KeyError:
            self.images_per_page[page] = 0

        output_filename = "%s_%04u_%02u.jpeg" % (self.output_basename, page, self.images_per_page[page])

        from datetime import datetime
        print("%s - Cropping page %u to box %s" % (datetime.now(), page, box))

        # The page is decoded to an uncompressed TIFF once and then reused
        # for every further crop of the same page.
        tempfile_tiff = os.path.join(WORKING_DIR, 'page_%u.tiff' % page)

        if tempfile_tiff not in self.temp_files:
            extracted_file_path = os.path.join(self.tempdir_zip, self.filenames[page])

            # Extract the image from the archive
            if self.is_zip:
                self.zf.extract(self.filenames[page], self.tempdir_zip)
            else:
                os.makedirs(os.path.dirname(extracted_file_path), exist_ok=True)
                with self.zf.extractfile(self.filenames[page]) as member_file, \
                        open(extracted_file_path, 'wb+') as extracted_file:
                    shutil.copyfileobj(member_file, extracted_file)

            # TODO: Just use Pillow here, before the rename even
            if self.use_kakadu and self.filenames[page].endswith('.jp2'):
                os.rename(extracted_file_path, self.tempfile_jp2)

                cmd = [
                    'kdu_expand',
                    '-num_threads', str(1),
                    '-i', self.tempfile_jp2,
                    '-o', tempfile_tiff
                ]

                result = subprocess.run(
                    cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True
                )

                if self.ignore_broken_images and result.stderr and image_has_non_fatal_kakadu_errors(result.stderr, KAKADU_ERRORS):
                    print(f'Error in image stack for {extracted_file_path}: {result.stderr}')
                    return
                elif result.returncode != 0:
                    raise RuntimeError(
                        f"Can't convert JP2 to TIFF: {result.stderr}"
                    )

            else:
                # The context manager closes the source file even if saving fails
                with Image.open(extracted_file_path) as source_image:
                    source_image.save(tempfile_tiff, compression=None)
                os.remove(extracted_file_path)

            # Keep track of the temp files so we can delete them later
            self.temp_files.append(tempfile_tiff)

        self.temp_files.append(output_filename)

        with Image.open(tempfile_tiff) as page_image:
            if hocr.util.needs_grayscale_conversion(image=page_image):
                output_image = page_image.convert('L')
            elif hocr.util.needs_rgb_conversion(image=page_image):
                output_image = page_image.convert('RGB')
            else:
                output_image = page_image

            if box:
                output_image.crop(box).save(output_filename)
            else:
                output_image.save(output_filename)
        return output_filename

    def __del__(self):
        # Best-effort cleanup. __del__ can run on a half-initialised object
        # or during interpreter shutdown, so failures are ignored - but each
        # step is guarded separately so that one failure (e.g. the extraction
        # directory was never created) does not skip the remaining steps.

        # Close the archive
        try:
            self.zf.close()
        except Exception:
            pass

        # Remove the extraction directory
        try:
            shutil.rmtree(self.tempdir_zip, ignore_errors=True)
        except Exception:
            pass

        # Remove the temporary JP2 and all recorded temporary files
        try:
            leftovers = [self.tempfile_jp2] + list(self.temp_files)
        except Exception:
            leftovers = []
        for leftover in leftovers:
            try:
                os.unlink(leftover)
            except Exception:
                pass


class EpubGenerator(object):
    """
    Convert an hOCR file (plus optional item metadata, scandata and image
    stack) into an EPUB file.

    Constructing an instance runs the whole conversion: `__init__` resolves
    the input files and then calls `generate`, which writes the EPUB to
    `epub_zip_file_path`.
    """

    __version__ = '1.0.0'

    front_matter = (
        '<div class="offset">'
        '<p dir="ltr">This book was produced in EPUB format by the '
        'Internet Archive.</p> '
        '<p dir="ltr">The book pages were scanned and converted to EPUB '
        'format automatically. This process relies on optical character '
        'recognition, and is somewhat susceptible to errors. The book may '
        'not offer the correct reading sequence, and there may be '
        'weird characters, non-words, and incorrect guesses at '
        'structure. Some page numbers and headers or footers may remain '
        'from the scanned page. The process which identifies images might '
        'have found stray marks on the page which are not actually images '
        'from the book. The hidden page numbering which may be available '
        'to your ereader corresponds to the numbered pages in the print '
        'edition, but is not an exact match;  page numbers will increment '
        'at the same rate as the corresponding print edition, but we may '
        'have started numbering before the print book\'s visible page '
        'numbers.  The Internet Archive is working to improve the '
        'scanning process and resulting books, but in the meantime, we '
        'hope that this book will be useful to you.</p> '
        '<p dir="ltr">The Internet Archive was founded in 1996 to build '
        'an Internet library and to promote universal access to all '
        'knowledge. The Archive\'s purposes include offering permanent '
        'access for researchers, historians, scholars, people with '
        'disabilities, and ' 'the general public to historical '
        'collections that exist in digital format. The Internet Archive '
        'includes texts, audio, moving images, '
        'and software as well as archived web pages, and provides '
        'specialized services for information access for the blind and '
        'other persons with disabilities.</p>'
        '<p>Created with hocr-to-epub (v.%s)</p></div>'
    ) % __version__

    # define CSS style
    style = """
        .center {text-align: center}
        .sr-only {
            width: 1px;
            height: 1px;
            padding: 0;
            margin: -1px;
            overflow: hidden;
            clip: rect(0,0,0,0);
            border: 0;
        }
        .strong {font-weight: bold;}
        .italic {font-style: italic;}
        .serif {font-family: serif;}
        .sans {font-family: sans-serif;}
        .big {font-size: 1.5em;}
        .small {font-size: .75em;}
        .offset {
            margin: 1em;
            padding: 1.5em;
            border: black 1px solid;
        }
        img {
            padding: 0;
            margin: 0;
            max-width: 100%;
            max-height: 100%;
            column-count: 1;
            break-inside: avoid;
            oeb-column-number: 1;
        }
        p {
            text-indent: 4em;
        }
        """

    # Strip leading/trailing whitespace from every OCR word before joining
    strip_whitespaces = True

    # Item metadata fields copied to Dublin Core metadata, in the order in
    # which they are added: (item metadata key, DC element, value template).
    # A template of None passes the value through unchanged.
    _DC_METADATA_FIELDS = (
        ('description', 'description', None),
        ('publisher', 'publisher', None),
        ('identifier-access', 'identifier', 'Access URL: {}'),
        ('identifier-ark', 'identifier', 'urn:ark:{}'),
        ('isbn', 'identifier', 'urn:isbn:{}'),
        ('oclc-id', 'identifier', 'urn:oclc:{}'),
        ('external-identifier', 'identifier', None),
        ('related-external-id', 'identifier', None),
        ('subject', 'subject', None),
        ('date', 'date', None),
    )

    def __init__(self,
                 hocr_xml_file_path: str,
                 meta_xml_file_path: Optional[str] = None,
                 image_stack_zip_file_path: Optional[str] = None,
                 scandata_xml_file_path: Optional[str] = None,
                 epub_zip_file_path: Optional[str] = None,
                 use_kakadu: bool = False,
                 ignore_broken_images: bool = False):
        """
        Resolve the input files and generate the EPUB.

        Every path that is not given is derived from `hocr_xml_file_path` by
        replacing '_hocr.html' with '_meta.xml', '_jp2.zip', '_scandata.xml'
        or '_ebook.epub' respectively. Metadata, image stack and scandata
        are optional: each is only used when its file exists (and, for
        metadata and scandata, when derivermodule could be imported).

        `use_kakadu` and `ignore_broken_images` are passed on to ImageStack.
        """
        def sibling_path(suffix):
            return hocr_xml_file_path.replace('_hocr.html', suffix)

        # Store the arguments, falling back to sensible defaults for those
        # that weren't provided
        self.hocr_xml_file_path = hocr_xml_file_path
        self.meta_xml_file_path = meta_xml_file_path or sibling_path('_meta.xml')
        self.image_stack_zip_file_path = image_stack_zip_file_path or sibling_path('_jp2.zip')
        self.scandata_xml_file_path = scandata_xml_file_path or sibling_path('_scandata.xml')
        self.epub_zip_file_path = epub_zip_file_path or sibling_path('_ebook.epub')

        # We can still make an epub without any of these if we must
        # Try to find metadata
        if os.path.exists(self.meta_xml_file_path) and parse_item_metadata is not None:
            self.metadata = parse_item_metadata(self.meta_xml_file_path)
        else:
            self.metadata = {}

        # Try to find jp2.zip
        if os.path.exists(self.image_stack_zip_file_path):
            self.img_stack = ImageStack(
                    self.image_stack_zip_file_path,
                    os.path.join(WORKING_DIR,"epub_img"),
                    use_kakadu=use_kakadu,
                    ignore_broken_images=ignore_broken_images)
        else:
            self.img_stack = None

        # Try to find scandata
        if os.path.exists(self.scandata_xml_file_path) \
        and scandata_get_skip_pages is not None \
        and scandata_get_pagetype_pages is not None:
            self.scandata    = scandata_parse(self.scandata_xml_file_path)
            self.skip_pages  = scandata_get_skip_pages(self.scandata)
            self.cover_pages = scandata_get_pagetype_pages(self.scandata, 'Cover')

            # Ignore all cover pages if one cover page is detected as cloth
            # cover (in this case the other cover pages are likely also cloth
            # covers)
            sd_pages = self.scandata['book']['pageData']['page']
            if not isinstance(sd_pages, list):
                sd_pages = [sd_pages]
            for cover_page in self.cover_pages:
                if sd_pages[cover_page].get('clothCover', None) == 'true':
                    self.cover_pages = []
                    break
        else:
            self.skip_pages = []
            self.cover_pages = []

        print("Parsing file %s" % self.hocr_xml_file_path)
        self.generate()

    def _metadata_values(self, key):
        """
        Return the item metadata stored under `key` as a list of values.

        A string becomes a one-element list and a list is returned as is.
        A missing key or a value of any other type yields an empty list.
        """
        value = self.metadata.get(key)
        if isinstance(value, str):
            return [value]
        if isinstance(value, list):
            return value
        return []

    @staticmethod
    def _join_page_text(page_content):
        """
        Join the words of a page with spaces and undo line-break hyphenation.

        `page_content` is a list of words in which a '\\x7f' placeholder
        follows every word that had a line-break hyphen removed. A word and
        its continuation on the next line are glued together. A placeholder
        that has no continuation (the page ends on a hyphenated word) is
        dropped, so the control character never reaches the EPUB.
        """
        # Flatten list into string and add spaces
        page_text = ' '.join(page_content)
        # Remove placeholder and spaces in the positions that previously had a line break hyphen
        page_text = page_text.replace(' \x7f ', '')
        # Whatever placeholder is left has nothing following it
        return page_text.replace(' \x7f', '').replace('\x7f', '')

    def set_metadata(self):
        """
        Set the metadata on the epub object

        Copies identifier, language, title, creator(s) and the fields listed
        in `_DC_METADATA_FIELDS` from `self.metadata` to `self.book`. Values
        may be a single string or a list of strings; anything else is
        ignored. The normalized language is also written back to
        `self.metadata['language']`, where `generate` reads it again.
        """
        if 'identifier' in self.metadata:
            self.book.set_identifier(self.metadata['identifier'])

        language = self.metadata.get('language')
        if isinstance(language, str):
            self.metadata['language'] = hocr.util.normalize_language(language)
            self.book.set_language(self.metadata['language'])
        elif isinstance(language, list):
            self.metadata['language'] = '; '.join(map(hocr.util.normalize_language, language))
            self.book.set_language(self.metadata['language'])

        title = self.metadata.get('title')
        if isinstance(title, str):
            self.book.set_title(title)
        elif isinstance(title, list):
            self.book.set_title(' ; '.join(title))

        # A single creator keeps the default uid; multiple creators each get
        # a numbered uid.
        creators = self.metadata.get('creator')
        if isinstance(creators, str):
            self.book.add_author(creators)
        elif isinstance(creators, list):
            for i, creator in enumerate(creators):
                creator_uid = 'creator_{creator_uid}'.format(creator_uid=i)
                self.book.add_author(creator, uid=creator_uid)

        for key, dc_name, template in self._DC_METADATA_FIELDS:
            for value in self._metadata_values(key):
                if template is not None:
                    value = template.format(value)
                self.book.add_metadata('DC', dc_name, value)

    def set_accessibility_metadata(self):
        """
        Add the schema.org accessibility metadata to the publication.

        Reads `self.has_text` and `self.has_images` to decide which access
        modes to declare and whether to mention missing alternative text.
        """
        summary = (
            'The publication was generated using automated character '
            'recognition, therefore it may not be an accurate rendition '
            'of the original text, and it may not offer the correct '
            'reading sequence.'
        )

        # The same list is used for accessMode and accessModeSufficient
        modes = []
        if self.has_text:
            modes.append('textual')
        if self.has_images:
            modes.append('visual')
            summary += ' This publication is missing meaningful alternative text.'

        summary += ' The publication otherwise meets WCAG 2.0 Level A.'

        properties = (
            ('schema:accessibilitySummary', [summary]),
            ('schema:accessMode', modes),
            ('schema:accessModeSufficient', modes),
            ('schema:accessibilityFeature', ['none', ]),
            # these states will be true for any static content,  which we know
            # is guaranteed for OCR generated texts.
            ('schema:accessibilityHazard', [
                'noFlashingHazard',
                'noMotionSimulationHazard',
                'noSoundHazard',
            ]),
            ('schema:accessibilityControl', [
                'fullKeyboardControl',
                'fullMouseControl',
                'fullSwitchControl',
                'fullTouchControl',
                'fullVoiceControl',
            ]),
        )
        for property_name, values in properties:
            for value in values:
                self.book.add_metadata(
                    None,
                    'meta',
                    value,
                    OrderedDict([('property', property_name)])
                )

    def generate(self, warn_confidence_threshold=75.0, remove_confidence_threshold=None):
        """
        Build the EPUB from the hOCR file and write it to
        `self.epub_zip_file_path`.

        Each hOCR page that is not listed in `self.skip_pages` and that has
        words or images becomes one XHTML page. Photo regions are cropped
        from the image stack when one is available; the first cover page (if
        any) becomes the book cover instead of being embedded in its page.

        NOTE: both keyword arguments are currently without effect. The
        warning threshold is chosen from the detected OCR system, and
        `remove_confidence_threshold` is reset to None for every OCR system.
        """
        self.book = epub.EpubBook()
        self.book.reset()
        self.set_metadata()

        css_file = epub.EpubItem(
            uid="style_nav",
            file_name="style/style.css",
            media_type="text/css",
            content=self.style
        )
        self.book.add_item(css_file)

        front_matter_epub = epub.EpubHtml(title='Notice', file_name='notice.html', lang='en', media_type='application/xhtml+xml')
        front_matter_epub.set_content(self.front_matter)

        hocr_ocr_system = hocr.util.get_ocr_system(hocr.util.open_if_required(self.hocr_xml_file_path)) or ''
        hocr_ocr_system = hocr_ocr_system.lower()

        if 'tesseract' in hocr_ocr_system:
            ocr_system = OCR_SYSTEM_TESSERACT
            warning_confidence_threshold = 50
            #remove_confidence_threshold = 30
            remove_confidence_threshold = None
        elif 'abbyy' in hocr_ocr_system or 'luratech' in hocr_ocr_system:
            ocr_system = OCR_SYSTEM_ABBYY
            warning_confidence_threshold = 30
            #remove_confidence_threshold = 20
            remove_confidence_threshold = None
        else:
            ocr_system = OCR_SYSTEM_UNKNOWN
            warning_confidence_threshold = 50
            remove_confidence_threshold = None

        print('Detected OCR system:', ocr_system)

        pages_hocr = hocr.parse.hocr_page_iterator(self.hocr_xml_file_path)
        pages_epub = [front_matter_epub]

        # ABBYY converter sometimes identifies linebreaks as a negation sign
        hyphens = ['-', '¬']

        # Iterate all the pages
        images_found = 0
        words_found = 0
        for page_idx, page in enumerate(pages_hocr):
            if page_idx in self.skip_pages:
                continue
            # Get all the words on the page
            word_data = hocr.parse.hocr_page_to_word_data(page)

            # Get all the photos on the page
            photo_boxes = hocr.parse.hocr_page_to_photo_data(page)

            page_content = []
            page_confidence = 0
            words_on_page = 0
            # Combine all the words on the page
            for element in word_data:
                line_content = []
                for line in element['lines']:
                    for word in line['words']:
                        # Save text data
                        text = html.escape(word['text'])
                        if self.strip_whitespaces:
                            text = text.strip()
                        line_content.append(text)

                        # Count word confidence scores
                        page_confidence += word['confidence']
                        words_found += 1
                        words_on_page += 1
                    # Examine the last character of the last element of the line
                    if len(line_content) and len(line_content[-1]) and line_content[-1][-1] in hyphens:
                        # Remove the last character if it is a hyphen
                        line_content[-1] = line_content[-1][:-1]
                        # Add placeholder value
                        line_content.append('\x7f')
                page_content += line_content

            # Create HTML/epub page
            page_html = u"<p>%s</p>" % self._join_page_text(page_content)

            # Add a warning if the confidence in the text is below the given threshold
            if words_on_page:
                page_confidence = page_confidence/words_on_page

                if remove_confidence_threshold and page_confidence < remove_confidence_threshold:
                    print('Skipping page due to low confidence: %0.02f%%' % page_confidence)
                    continue
                if page_confidence < warning_confidence_threshold:
                    print('Warning for page due to low confidence: %.02f%%' % page_confidence)
                    page_html = (u"<b>The text on this page is estimated to be only %0.02f%% accurate</b>" % page_confidence) + page_html

            # Add all the images from the page. Images can only be embedded
            # when an image stack was found, and the first cover page is
            # treated as a special case: it is used as the book cover.
            images_on_page = 0
            can_embed_images = (self.img_stack is not None
                                and page_idx not in self.cover_pages[:1])
            if can_embed_images:
                for image_idx, box in enumerate(photo_boxes):
                    cropped_image_filename = self.img_stack.crop_image(page_idx, box)
                    if not cropped_image_filename:
                        # Broken image that was ignored by the image stack
                        continue
                    with open(cropped_image_filename, "rb") as cropped_image_file:
                        cropped_jpeg_data = cropped_image_file.read()
                    image_filename_epub = "image_%04u_%02u.jpeg" % (page_idx, image_idx)
                    image_epub = epub.EpubImage()
                    image_epub.file_name = image_filename_epub
                    image_epub.media_type = "image/jpeg"
                    image_epub.set_content(cropped_jpeg_data)

                    self.book.add_item(image_epub)
                    page_html += "<img src=\"%s\" alt=\"Image %u\"/>" % (image_filename_epub, images_found)
                    images_found += 1
                    images_on_page += 1

            if words_on_page or images_on_page:
                page_epub = epub.EpubHtml(title='Page %s' % page_idx,
                                          file_name='page_%s.html' % page_idx,
                                          media_type='application/xhtml+xml',
                                          )
                if 'language' in self.metadata:
                    page_epub.set_language(self.metadata['language'])
                else:
                    # NOTE(review): this sets the literal string 'None' as
                    # the page language - confirm that this is intended.
                    page_epub.set_language('None')
                page_epub.add_link(
                    href='style/style.css', rel='stylesheet', type='text/css'
                )
                page_epub.set_content(page_html)
                pages_epub.append(page_epub)

        # Apply some transformations to remove headings and page numbers
        # TODO

        # Add all the pages to the book
        for page_epub in pages_epub:
            self.book.add_item(page_epub)

        self.has_text = words_found > 0
        self.has_images = images_found > 0
        self.set_accessibility_metadata()

        # Set cover. This needs an image stack, and the cover image may turn
        # out to be a broken image that the image stack ignored.
        has_cover = False
        if len(self.cover_pages) and self.img_stack is not None:
            cover_filename = self.img_stack.crop_image(self.cover_pages[0], None)
            if cover_filename:
                with open(cover_filename, "rb") as cover_file:
                    jpeg_data = cover_file.read()
                self.book.set_cover("cover.jpeg", jpeg_data)
                has_cover = True

        # We don't have enough information to create TOC/chapters/sections yet
        #book.toc = pages_epub
        self.book.toc = (front_matter_epub,)
        self.book.add_item(epub.EpubNcx())
        self.book.add_item(epub.EpubNav())
        # Only reference the cover in the spine when one was actually set
        if has_cover:
            self.book.spine = ['cover', 'nav', ] + pages_epub
        else:
            self.book.spine = ['nav', ] + pages_epub
        epub.write_epub(self.epub_zip_file_path, self.book, {})


if __name__ == '__main__':
    # Command line entry point: parse the options and run the conversion.
    cli = argparse.ArgumentParser(description='hOCR to ePUB converter')

    # Options that take a path; all of them are optional strings
    path_options = (
        ('-f', '--infile', 'Item _hocr.html file'),
        ('-o', '--outfile', 'Output _ebook.epub file'),
        ('-m', '--metafile', 'Item _meta.xml file'),
        ('-i', '--imagestack', 'Item imagestack file (usually _jp2.zip)'),
        ('-s', '--scandata', 'Item _scandata.xml file'),
        ('-w', '--workingdir', 'Directory used for temp files'),
    )
    for short_flag, long_flag, help_text in path_options:
        cli.add_argument(short_flag, long_flag, help=help_text,
                         type=str, default=None)

    # Boolean switches
    cli.add_argument('--kakadu', help='Use kakadu is available', action='store_true')
    cli.add_argument('--ignore-broken-images', help='Continue even if kakadu cannot use an image',
                     action='store_true')
    options = cli.parse_args()

    if not options.infile:
        raise Exception("Must provide hOCR input file with -f")

    # Allow external caller to override working directory from default /tmp/ or /var/tmp/fast/
    if options.workingdir:
        WORKING_DIR = options.workingdir

    # Constructing the generator performs the conversion
    EpubGenerator(
        hocr_xml_file_path=options.infile,
        meta_xml_file_path=options.metafile,
        image_stack_zip_file_path=options.imagestack,
        scandata_xml_file_path=options.scandata,
        epub_zip_file_path=options.outfile,
        use_kakadu=options.kakadu,
        ignore_broken_images=options.ignore_broken_images)
