tesseract  3.05.00
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tesseract::TessPDFRenderer Class Reference

#include <renderer.h>

Inheritance diagram for tesseract::TessPDFRenderer:
tesseract::TessResultRenderer

Public Member Functions

 TessPDFRenderer (const char *outputbase, const char *datadir)
 
- Public Member Functions inherited from tesseract::TessResultRenderer
virtual ~TessResultRenderer ()
 
void insert (TessResultRenderer *next)
 
TessResultRenderer * next ()
 
bool BeginDocument (const char *title)
 
bool AddImage (TessBaseAPI *api)
 
bool EndDocument ()
 
const char * file_extension () const
 
const char * title () const
 
int imagenum () const
 

Protected Member Functions

virtual bool BeginDocumentHandler ()
 
virtual bool AddImageHandler (TessBaseAPI *api)
 
virtual bool EndDocumentHandler ()
 
- Protected Member Functions inherited from tesseract::TessResultRenderer
 TessResultRenderer (const char *outputbase, const char *extension)
 
void AppendString (const char *s)
 
void AppendData (const char *s, int len)
 

Detailed Description

Renders Tesseract output into a searchable PDF.

Definition at line 185 of file renderer.h.

Constructor & Destructor Documentation

tesseract::TessPDFRenderer::TessPDFRenderer ( const char *  outputbase,
const char *  datadir 
)

Definition at line 181 of file pdfrenderer.cpp.

182  : TessResultRenderer(outputbase, "pdf") {
183  obj_ = 0;
184  datadir_ = datadir;
185  offsets_.push_back(0);
186 }
int push_back(T object)
TessResultRenderer(const char *outputbase, const char *extension)
Definition: renderer.cpp:32

Member Function Documentation

bool tesseract::TessPDFRenderer::AddImageHandler ( TessBaseAPI *  api )
protected virtual

Implements tesseract::TessResultRenderer.

Definition at line 837 of file pdfrenderer.cpp.

837  {
838  size_t n;
839  char buf[kBasicBufSize];
840  Pix *pix = api->GetInputImage();
841  char *filename = (char *)api->GetInputName();
842  int ppi = api->GetSourceYResolution();
843  if (!pix || ppi <= 0)
844  return false;
845  double width = pixGetWidth(pix) * 72.0 / ppi;
846  double height = pixGetHeight(pix) * 72.0 / ppi;
847 
848  // PAGE
849  n = snprintf(buf, sizeof(buf),
850  "%ld 0 obj\n"
851  "<<\n"
852  " /Type /Page\n"
853  " /Parent %ld 0 R\n"
854  " /MediaBox [0 0 %.2f %.2f]\n"
855  " /Contents %ld 0 R\n"
856  " /Resources\n"
857  " <<\n"
858  " /XObject << /Im1 %ld 0 R >>\n"
859  " /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n"
860  " /Font << /f-0-0 %ld 0 R >>\n"
861  " >>\n"
862  ">>\n"
863  "endobj\n",
864  obj_,
865  2L, // Pages object
866  width,
867  height,
868  obj_ + 1, // Contents object
869  obj_ + 2, // Image object
870  3L); // Type0 Font
871  if (n >= sizeof(buf)) return false;
872  pages_.push_back(obj_);
873  AppendPDFObject(buf);
874 
875  // CONTENTS
876  char* pdftext = GetPDFTextObjects(api, width, height);
877  long pdftext_len = strlen(pdftext);
878  unsigned char *pdftext_casted = reinterpret_cast<unsigned char *>(pdftext);
879  size_t len;
880  unsigned char *comp_pdftext =
881  zlibCompress(pdftext_casted, pdftext_len, &len);
882  long comp_pdftext_len = len;
883  n = snprintf(buf, sizeof(buf),
884  "%ld 0 obj\n"
885  "<<\n"
886  " /Length %ld /Filter /FlateDecode\n"
887  ">>\n"
888  "stream\n", obj_, comp_pdftext_len);
889  if (n >= sizeof(buf)) {
890  delete[] pdftext;
891  lept_free(comp_pdftext);
892  return false;
893  }
894  AppendString(buf);
895  long objsize = strlen(buf);
896  AppendData(reinterpret_cast<char *>(comp_pdftext), comp_pdftext_len);
897  objsize += comp_pdftext_len;
898  lept_free(comp_pdftext);
899  delete[] pdftext;
900  const char *b2 =
901  "endstream\n"
902  "endobj\n";
903  AppendString(b2);
904  objsize += strlen(b2);
905  AppendPDFObjectDIY(objsize);
906 
907  char *pdf_object;
908  if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize)) {
909  return false;
910  }
911  AppendData(pdf_object, objsize);
912  AppendPDFObjectDIY(objsize);
913  delete[] pdf_object;
914  return true;
915 }
void AppendString(const char *s)
Definition: renderer.cpp:99
const int kBasicBufSize
int push_back(T object)
void AppendData(const char *s, int len)
Definition: renderer.cpp:103
bool tesseract::TessPDFRenderer::BeginDocumentHandler ( )
protected virtual

Reimplemented from tesseract::TessResultRenderer.

Definition at line 485 of file pdfrenderer.cpp.

485  {
486  char buf[kBasicBufSize];
487  size_t n;
488 
489  n = snprintf(buf, sizeof(buf),
490  "%%PDF-1.5\n"
491  "%%%c%c%c%c\n",
492  0xDE, 0xAD, 0xBE, 0xEB);
493  if (n >= sizeof(buf)) return false;
494  AppendPDFObject(buf);
495 
496  // CATALOG
497  n = snprintf(buf, sizeof(buf),
498  "1 0 obj\n"
499  "<<\n"
500  " /Type /Catalog\n"
501  " /Pages %ld 0 R\n"
502  ">>\n"
503  "endobj\n",
504  2L);
505  if (n >= sizeof(buf)) return false;
506  AppendPDFObject(buf);
507 
508  // We are reserving object #2 for the /Pages
509  // object, which I am going to create and write
510  // at the end of the PDF file.
511  AppendPDFObject("");
512 
513  // TYPE0 FONT
514  n = snprintf(buf, sizeof(buf),
515  "3 0 obj\n"
516  "<<\n"
517  " /BaseFont /GlyphLessFont\n"
518  " /DescendantFonts [ %ld 0 R ]\n"
519  " /Encoding /Identity-H\n"
520  " /Subtype /Type0\n"
521  " /ToUnicode %ld 0 R\n"
522  " /Type /Font\n"
523  ">>\n"
524  "endobj\n",
525  4L, // CIDFontType2 font
526  6L // ToUnicode
527  );
528  if (n >= sizeof(buf)) return false;
529  AppendPDFObject(buf);
530 
531  // CIDFONTTYPE2
532  n = snprintf(buf, sizeof(buf),
533  "4 0 obj\n"
534  "<<\n"
535  " /BaseFont /GlyphLessFont\n"
536  " /CIDToGIDMap %ld 0 R\n"
537  " /CIDSystemInfo\n"
538  " <<\n"
539  " /Ordering (Identity)\n"
540  " /Registry (Adobe)\n"
541  " /Supplement 0\n"
542  " >>\n"
543  " /FontDescriptor %ld 0 R\n"
544  " /Subtype /CIDFontType2\n"
545  " /Type /Font\n"
546  " /DW %d\n"
547  ">>\n"
548  "endobj\n",
549  5L, // CIDToGIDMap
550  7L, // Font descriptor
551  1000 / kCharWidth);
552  if (n >= sizeof(buf)) return false;
553  AppendPDFObject(buf);
554 
555  // CIDTOGIDMAP
556  const int kCIDToGIDMapSize = 2 * (1 << 16);
557  unsigned char *cidtogidmap = new unsigned char[kCIDToGIDMapSize];
558  for (int i = 0; i < kCIDToGIDMapSize; i++) {
559  cidtogidmap[i] = (i % 2) ? 1 : 0;
560  }
561  size_t len;
562  unsigned char *comp =
563  zlibCompress(cidtogidmap, kCIDToGIDMapSize, &len);
564  delete[] cidtogidmap;
565  n = snprintf(buf, sizeof(buf),
566  "5 0 obj\n"
567  "<<\n"
568  " /Length %lu /Filter /FlateDecode\n"
569  ">>\n"
570  "stream\n",
571  (unsigned long)len);
572  if (n >= sizeof(buf)) {
573  lept_free(comp);
574  return false;
575  }
576  AppendString(buf);
577  long objsize = strlen(buf);
578  AppendData(reinterpret_cast<char *>(comp), len);
579  objsize += len;
580  lept_free(comp);
581  const char *endstream_endobj =
582  "endstream\n"
583  "endobj\n";
584  AppendString(endstream_endobj);
585  objsize += strlen(endstream_endobj);
586  AppendPDFObjectDIY(objsize);
587 
588  const char *stream =
589  "/CIDInit /ProcSet findresource begin\n"
590  "12 dict begin\n"
591  "begincmap\n"
592  "/CIDSystemInfo\n"
593  "<<\n"
594  " /Registry (Adobe)\n"
595  " /Ordering (UCS)\n"
596  " /Supplement 0\n"
597  ">> def\n"
598  "/CMapName /Adobe-Identify-UCS def\n"
599  "/CMapType 2 def\n"
600  "1 begincodespacerange\n"
601  "<0000> <FFFF>\n"
602  "endcodespacerange\n"
603  "1 beginbfrange\n"
604  "<0000> <FFFF> <0000>\n"
605  "endbfrange\n"
606  "endcmap\n"
607  "CMapName currentdict /CMap defineresource pop\n"
608  "end\n"
609  "end\n";
610 
611  // TOUNICODE
612  n = snprintf(buf, sizeof(buf),
613  "6 0 obj\n"
614  "<< /Length %lu >>\n"
615  "stream\n"
616  "%s"
617  "endstream\n"
618  "endobj\n", (unsigned long) strlen(stream), stream);
619  if (n >= sizeof(buf)) return false;
620  AppendPDFObject(buf);
621 
622  // FONT DESCRIPTOR
623  n = snprintf(buf, sizeof(buf),
624  "7 0 obj\n"
625  "<<\n"
626  " /Ascent %d\n"
627  " /CapHeight %d\n"
628  " /Descent -1\n" // Spec says must be negative
629  " /Flags 5\n" // FixedPitch + Symbolic
630  " /FontBBox [ 0 0 %d %d ]\n"
631  " /FontFile2 %ld 0 R\n"
632  " /FontName /GlyphLessFont\n"
633  " /ItalicAngle 0\n"
634  " /StemV 80\n"
635  " /Type /FontDescriptor\n"
636  ">>\n"
637  "endobj\n",
638  1000,
639  1000,
640  1000 / kCharWidth,
641  1000,
642  8L // Font data
643  );
644  if (n >= sizeof(buf)) return false;
645  AppendPDFObject(buf);
646 
647  n = snprintf(buf, sizeof(buf), "%s/pdf.ttf", datadir_);
648  if (n >= sizeof(buf)) return false;
649  FILE *fp = fopen(buf, "rb");
650  if (!fp) {
651  tprintf("Can not open file \"%s\"!\n", buf);
652  return false;
653  }
654  fseek(fp, 0, SEEK_END);
655  long int size = ftell(fp);
656  fseek(fp, 0, SEEK_SET);
657  char *buffer = new char[size];
658  if (fread(buffer, 1, size, fp) != size) {
659  fclose(fp);
660  delete[] buffer;
661  return false;
662  }
663  fclose(fp);
664  // FONTFILE2
665  n = snprintf(buf, sizeof(buf),
666  "8 0 obj\n"
667  "<<\n"
668  " /Length %ld\n"
669  " /Length1 %ld\n"
670  ">>\n"
671  "stream\n", size, size);
672  if (n >= sizeof(buf)) {
673  delete[] buffer;
674  return false;
675  }
676  AppendString(buf);
677  objsize = strlen(buf);
678  AppendData(buffer, size);
679  delete[] buffer;
680  objsize += size;
681  AppendString(endstream_endobj);
682  objsize += strlen(endstream_endobj);
683  AppendPDFObjectDIY(objsize);
684  return true;
685 }
void AppendString(const char *s)
Definition: renderer.cpp:99
const int kCharWidth
#define tprintf(...)
Definition: tprintf.h:31
const int kBasicBufSize
void AppendData(const char *s, int len)
Definition: renderer.cpp:103
bool tesseract::TessPDFRenderer::EndDocumentHandler ( )
protected virtual

Reimplemented from tesseract::TessResultRenderer.

Definition at line 918 of file pdfrenderer.cpp.

918  {
919  size_t n;
920  char buf[kBasicBufSize];
921 
922  // We reserved the /Pages object number early, so that the /Page
923  // objects could refer to their parent. We finally have enough
924  // information to go fill it in. Using lower level calls to manipulate
925  // the offset record in two spots, because we are placing objects
926  // out of order in the file.
927 
928  // PAGES
929  const long int kPagesObjectNumber = 2;
930  offsets_[kPagesObjectNumber] = offsets_.back(); // manipulation #1
931  n = snprintf(buf, sizeof(buf),
932  "%ld 0 obj\n"
933  "<<\n"
934  " /Type /Pages\n"
935  " /Kids [ ", kPagesObjectNumber);
936  if (n >= sizeof(buf)) return false;
937  AppendString(buf);
938  size_t pages_objsize = strlen(buf);
939  for (size_t i = 0; i < pages_.size(); i++) {
940  n = snprintf(buf, sizeof(buf),
941  "%ld 0 R ", pages_[i]);
942  if (n >= sizeof(buf)) return false;
943  AppendString(buf);
944  pages_objsize += strlen(buf);
945  }
946  n = snprintf(buf, sizeof(buf),
947  "]\n"
948  " /Count %d\n"
949  ">>\n"
950  "endobj\n", pages_.size());
951  if (n >= sizeof(buf)) return false;
952  AppendString(buf);
953  pages_objsize += strlen(buf);
954  offsets_.back() += pages_objsize; // manipulation #2
955 
956  // INFO
957  char* datestr = l_getFormattedDate();
958  n = snprintf(buf, sizeof(buf),
959  "%ld 0 obj\n"
960  "<<\n"
961  " /Producer (Tesseract %s)\n"
962  " /CreationDate (D:%s)\n"
963  " /Title (%s)"
964  ">>\n"
965  "endobj\n", obj_, TESSERACT_VERSION_STR, datestr, title());
966  lept_free(datestr);
967  if (n >= sizeof(buf)) return false;
968  AppendPDFObject(buf);
969  n = snprintf(buf, sizeof(buf),
970  "xref\n"
971  "0 %ld\n"
972  "0000000000 65535 f \n", obj_);
973  if (n >= sizeof(buf)) return false;
974  AppendString(buf);
975  for (int i = 1; i < obj_; i++) {
976  n = snprintf(buf, sizeof(buf), "%010ld 00000 n \n", offsets_[i]);
977  if (n >= sizeof(buf)) return false;
978  AppendString(buf);
979  }
980  n = snprintf(buf, sizeof(buf),
981  "trailer\n"
982  "<<\n"
983  " /Size %ld\n"
984  " /Root %ld 0 R\n"
985  " /Info %ld 0 R\n"
986  ">>\n"
987  "startxref\n"
988  "%ld\n"
989  "%%%%EOF\n",
990  obj_,
991  1L, // catalog
992  obj_ - 1, // info
993  offsets_.back());
994  if (n >= sizeof(buf)) return false;
995  AppendString(buf);
996  return true;
997 }
void AppendString(const char *s)
Definition: renderer.cpp:99
T & back() const
int size() const
Definition: genericvector.h:72
#define TESSERACT_VERSION_STR
Definition: baseapi.h:23
const int kBasicBufSize
const char * title() const
Definition: renderer.h:80

The documentation for this class was generated from the following files: