tesseract  3.05.00
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
openclwrapper.cpp
Go to the documentation of this file.
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use this file except in compliance with the License.
3 // You may obtain a copy of the License at
4 // http://www.apache.org/licenses/LICENSE-2.0
5 // Unless required by applicable law or agreed to in writing, software
6 // distributed under the License is distributed on an "AS IS" BASIS,
7 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8 // See the License for the specific language governing permissions and
9 // limitations under the License.
10 #ifdef _WIN32
11 #include <io.h>
12 #else
13 #include <sys/types.h>
14 #include <unistd.h>
15 #endif
16 #include <float.h>
17 
18 #include "openclwrapper.h"
19 #include "oclkernels.h"
20 
21 // for micro-benchmark
22 #include "otsuthr.h"
23 #include "thresholder.h"
24 
25 #if ON_APPLE
26 #include <mach/mach_time.h>
27 #include <stdio.h>
28 #endif
29 
30 /*
31  Convenience macro to test the version of Leptonica.
32 */
33 #if defined(LIBLEPT_MAJOR_VERSION) && defined(LIBLEPT_MINOR_VERSION)
34 #define TESSERACT_LIBLEPT_PREREQ(maj, min) \
35  ((LIBLEPT_MAJOR_VERSION) > (maj) || \
36  ((LIBLEPT_MAJOR_VERSION) == (maj) && (LIBLEPT_MINOR_VERSION) >= (min)))
37 #else
38 #define TESSERACT_LIBLEPT_PREREQ(maj, min) 0
39 #endif
40 
41 #if TESSERACT_LIBLEPT_PREREQ(1, 73)
42 #define CALLOC LEPT_CALLOC
43 #define FREE LEPT_FREE
44 #endif
45 
46 #ifdef USE_OPENCL
47 
49 GPUEnv OpenclDevice::gpuEnv;
50 
51 bool OpenclDevice::deviceIsSelected = false;
52 ds_device OpenclDevice::selectedDevice;
53 
54 int OpenclDevice::isInited = 0;
55 
56 static l_int32 MORPH_BC = ASYMMETRIC_MORPH_BC;
57 
58 static const l_uint32 lmask32[] = {
59  0x80000000, 0xc0000000, 0xe0000000, 0xf0000000, 0xf8000000, 0xfc000000,
60  0xfe000000, 0xff000000, 0xff800000, 0xffc00000, 0xffe00000, 0xfff00000,
61  0xfff80000, 0xfffc0000, 0xfffe0000, 0xffff0000, 0xffff8000, 0xffffc000,
62  0xffffe000, 0xfffff000, 0xfffff800, 0xfffffc00, 0xfffffe00, 0xffffff00,
63  0xffffff80, 0xffffffc0, 0xffffffe0, 0xfffffff0, 0xfffffff8, 0xfffffffc,
64  0xfffffffe, 0xffffffff};
65 
66 static const l_uint32 rmask32[] = {
67  0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f,
68  0x0000007f, 0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff,
69  0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff, 0x0001ffff, 0x0003ffff,
70  0x0007ffff, 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff,
71  0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff, 0x3fffffff,
72  0x7fffffff, 0xffffffff};
73 
74 struct tiff_transform {
75  int vflip; /* if non-zero, image needs a vertical fip */
76  int hflip; /* if non-zero, image needs a horizontal flip */
77  int rotate; /* -1 -> counterclockwise 90-degree rotation,
78  0 -> no rotation
79  1 -> clockwise 90-degree rotation */
80 };
81 
82 static struct tiff_transform tiff_orientation_transforms[] = {
83  {0, 0, 0},
84  {0, 1, 0},
85  {1, 1, 0},
86  {1, 0, 0},
87  {0, 1, -1},
88  {0, 0, 1},
89  {0, 1, 1},
90  {0, 0, -1}
91 };
92 
93 static const l_int32 MAX_PAGES_IN_TIFF_FILE = 3000;
94 
95 cl_mem pixsCLBuffer, pixdCLBuffer, pixdCLIntermediate; //Morph operations buffers
96 cl_mem pixThBuffer; //output from thresholdtopix calculation
97 cl_int clStatus;
98 KernelEnv rEnv;
99 
100 // substitute invalid characters in device name with _
101 void legalizeFileName( char *fileName) {
102  //printf("fileName: %s\n", fileName);
103  const char *invalidChars =
104  "/\?:*\"><| "; // space is valid but can cause headaches
105  // for each invalid char
106  for (int i = 0; i < strlen(invalidChars); i++) {
107  char invalidStr[4];
108  invalidStr[0] = invalidChars[i];
109  invalidStr[1] = '\0';
110  //printf("eliminating %s\n", invalidStr);
111  //char *pos = strstr(fileName, invalidStr);
112  // initial ./ is valid for present directory
113  //if (*pos == '.') pos++;
114  //if (*pos == '/') pos++;
115  for (char *pos = strstr(fileName, invalidStr); pos != NULL;
116  pos = strstr(pos + 1, invalidStr)) {
117  // printf("\tfound: %s, ", pos);
118  pos[0] = '_';
119  // printf("fileName: %s\n", fileName);
120  }
121  }
122 }
123 
124 void populateGPUEnvFromDevice( GPUEnv *gpuInfo, cl_device_id device ) {
125  //printf("[DS] populateGPUEnvFromDevice\n");
126  size_t size;
127  gpuInfo->mnIsUserCreated = 1;
128  // device
129  gpuInfo->mpDevID = device;
130  gpuInfo->mpArryDevsID = new cl_device_id[1];
131  gpuInfo->mpArryDevsID[0] = gpuInfo->mpDevID;
132  clStatus =
133  clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_TYPE,
134  sizeof(cl_device_type), &gpuInfo->mDevType, &size);
135  CHECK_OPENCL( clStatus, "populateGPUEnv::getDeviceInfo(TYPE)");
136  // platform
137  clStatus =
138  clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_PLATFORM,
139  sizeof(cl_platform_id), &gpuInfo->mpPlatformID, &size);
140  CHECK_OPENCL( clStatus, "populateGPUEnv::getDeviceInfo(PLATFORM)");
141  // context
142  cl_context_properties props[3];
143  props[0] = CL_CONTEXT_PLATFORM;
144  props[1] = (cl_context_properties) gpuInfo->mpPlatformID;
145  props[2] = 0;
146  gpuInfo->mpContext = clCreateContext(props, 1, &gpuInfo->mpDevID, NULL,
147  NULL, &clStatus);
148  CHECK_OPENCL( clStatus, "populateGPUEnv::createContext");
149  // queue
150  cl_command_queue_properties queueProperties = 0;
151  gpuInfo->mpCmdQueue = clCreateCommandQueue( gpuInfo->mpContext, gpuInfo->mpDevID, queueProperties, &clStatus );
152  CHECK_OPENCL( clStatus, "populateGPUEnv::createCommandQueue");
153 }
154 
155 int OpenclDevice::LoadOpencl()
156 {
157 #ifdef WIN32
158  HINSTANCE HOpenclDll = NULL;
159  void *OpenclDll = NULL;
160  // fprintf(stderr, " LoadOpenclDllxx... \n");
161  OpenclDll = static_cast<HINSTANCE>(HOpenclDll);
162  OpenclDll = LoadLibrary("openCL.dll");
163  if (!static_cast<HINSTANCE>(OpenclDll)) {
164  fprintf(stderr, "[OD] Load opencl.dll failed!\n");
165  FreeLibrary(static_cast<HINSTANCE>(OpenclDll));
166  return 0;
167  }
168  fprintf(stderr, "[OD] Load opencl.dll successful!\n");
169 #endif
170  return 1;
171 }
172 int OpenclDevice::SetKernelEnv( KernelEnv *envInfo )
173 {
174  envInfo->mpkContext = gpuEnv.mpContext;
175  envInfo->mpkCmdQueue = gpuEnv.mpCmdQueue;
176  envInfo->mpkProgram = gpuEnv.mpArryPrograms[0];
177 
178  return 1;
179 }
180 
181 cl_mem allocateZeroCopyBuffer(KernelEnv rEnv, l_uint32 *hostbuffer, size_t nElements, cl_mem_flags flags, cl_int *pStatus)
182 {
183  cl_mem membuffer = clCreateBuffer( rEnv.mpkContext, (cl_mem_flags) (flags),
184  nElements * sizeof(l_uint32), hostbuffer, pStatus);
185 
186  return membuffer;
187 }
188 
189 PIX *mapOutputCLBuffer(KernelEnv rEnv, cl_mem clbuffer, PIX *pixd, PIX *pixs,
190  int elements, cl_mem_flags flags, bool memcopy = false,
191  bool sync = true) {
192  PROCNAME("mapOutputCLBuffer");
193  if (!pixd) {
194  if (memcopy) {
195  if ((pixd = pixCreateTemplate(pixs)) == NULL)
196  (PIX *)ERROR_PTR("pixd not made", procName, NULL);
197  } else {
198  if ((pixd = pixCreateHeader(pixGetWidth(pixs), pixGetHeight(pixs),
199  pixGetDepth(pixs))) == NULL)
200  (PIX *)ERROR_PTR("pixd not made", procName, NULL);
201  }
202  }
203  l_uint32 *pValues = (l_uint32 *)clEnqueueMapBuffer(
204  rEnv.mpkCmdQueue, clbuffer, CL_TRUE, flags, 0,
205  elements * sizeof(l_uint32), 0, NULL, NULL, NULL);
206 
207  if (memcopy) {
208  memcpy(pixGetData(pixd), pValues, elements * sizeof(l_uint32));
209  } else {
210  pixSetData(pixd, pValues);
211  }
212 
213  clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, clbuffer, pValues, 0, NULL,
214  NULL);
215 
216  if (sync) {
217  clFinish(rEnv.mpkCmdQueue);
218  }
219 
220  return pixd;
221 }
222 
223  cl_mem allocateIntBuffer( KernelEnv rEnv, const l_uint32 *_pValues, size_t nElements, cl_int *pStatus , bool sync = false)
224 {
225  cl_mem xValues =
226  clCreateBuffer(rEnv.mpkContext, (cl_mem_flags)(CL_MEM_READ_WRITE),
227  nElements * sizeof(l_int32), NULL, pStatus);
228 
229  if (_pValues != NULL) {
230  l_int32 *pValues = (l_int32 *)clEnqueueMapBuffer(
231  rEnv.mpkCmdQueue, xValues, CL_TRUE, CL_MAP_WRITE, 0,
232  nElements * sizeof(l_int32), 0, NULL, NULL, NULL);
233 
234  memcpy(pValues, _pValues, nElements * sizeof(l_int32));
235 
236  clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, xValues, pValues, 0, NULL,
237  NULL);
238 
239  if (sync) clFinish(rEnv.mpkCmdQueue);
240  }
241 
242  return xValues;
243 }
244 
245 
246 void OpenclDevice::releaseMorphCLBuffers()
247 {
248  if (pixdCLIntermediate != NULL) clReleaseMemObject(pixdCLIntermediate);
249  if (pixsCLBuffer != NULL) clReleaseMemObject(pixsCLBuffer);
250  if (pixdCLBuffer != NULL) clReleaseMemObject(pixdCLBuffer);
251  if (pixThBuffer != NULL) clReleaseMemObject(pixThBuffer);
252  pixdCLIntermediate = pixsCLBuffer = pixdCLBuffer = pixThBuffer = NULL;
253 }
254 
255 int OpenclDevice::initMorphCLAllocations(l_int32 wpl, l_int32 h, PIX* pixs)
256 {
257  SetKernelEnv( &rEnv );
258 
259  if (pixThBuffer != NULL) {
260  pixsCLBuffer = allocateZeroCopyBuffer(rEnv, NULL, wpl * h,
261  CL_MEM_ALLOC_HOST_PTR, &clStatus);
262 
263  // Get the output from ThresholdToPix operation
264  clStatus =
265  clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixThBuffer, pixsCLBuffer, 0, 0,
266  sizeof(l_uint32) * wpl * h, 0, NULL, NULL);
267  }
268  else
269  {
270  //Get data from the source image
271  l_uint32* srcdata = (l_uint32*) malloc(wpl*h*sizeof(l_uint32));
272  memcpy(srcdata, pixGetData(pixs), wpl*h*sizeof(l_uint32));
273 
274  pixsCLBuffer = allocateZeroCopyBuffer(rEnv, srcdata, wpl*h, CL_MEM_USE_HOST_PTR, &clStatus);
275  }
276 
277  pixdCLBuffer = allocateZeroCopyBuffer(rEnv, NULL, wpl * h,
278  CL_MEM_ALLOC_HOST_PTR, &clStatus);
279 
280  pixdCLIntermediate = allocateZeroCopyBuffer(
281  rEnv, NULL, wpl * h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
282 
283  return (int)clStatus;
284 }
285 
286 int OpenclDevice::InitEnv()
287 {
288 //PERF_COUNT_START("OD::InitEnv")
289 // printf("[OD] OpenclDevice::InitEnv()\n");
290 #ifdef SAL_WIN32
291  while( 1 )
292  {
293  if( 1 == LoadOpencl() )
294  break;
295  }
296 PERF_COUNT_SUB("LoadOpencl")
297 #endif
298  // sets up environment, compiles programs
299 
300  InitOpenclRunEnv_DeviceSelection( 0 );
301 //PERF_COUNT_SUB("called InitOpenclRunEnv_DS")
302 //PERF_COUNT_END
303  return 1;
304 }
305 
306 int OpenclDevice::ReleaseOpenclRunEnv()
307 {
308  ReleaseOpenclEnv( &gpuEnv );
309 #ifdef SAL_WIN32
310  FreeOpenclDll();
311 #endif
312  return 1;
313 }
314 inline int OpenclDevice::AddKernelConfig( int kCount, const char *kName )
315 {
316  if ( kCount < 1 )
317  fprintf(stderr,"Error: ( KCount < 1 ) AddKernelConfig\n" );
318  strcpy( gpuEnv.mArrykernelNames[kCount-1], kName );
319  gpuEnv.mnKernelCount++;
320  return 0;
321 }
322 int OpenclDevice::RegistOpenclKernel()
323 {
324  if ( !gpuEnv.mnIsUserCreated )
325  memset( &gpuEnv, 0, sizeof(gpuEnv) );
326 
327  gpuEnv.mnFileCount = 0; //argc;
328  gpuEnv.mnKernelCount = 0UL;
329 
330  AddKernelConfig( 1, (const char*) "oclAverageSub1" );
331  return 0;
332 }
333 
334 int OpenclDevice::InitOpenclRunEnv_DeviceSelection( int argc ) {
335 //PERF_COUNT_START("InitOpenclRunEnv_DS")
336  if (!isInited) {
337  // after programs compiled, selects best device
338  //printf("[DS] InitOpenclRunEnv_DS::Calling performDeviceSelection()\n");
339  ds_device bestDevice_DS = getDeviceSelection( );
340 //PERF_COUNT_SUB("called getDeviceSelection()")
341  cl_device_id bestDevice = bestDevice_DS.oclDeviceID;
342  // overwrite global static GPUEnv with new device
343  if (selectedDeviceIsOpenCL() ) {
344  //printf("[DS] InitOpenclRunEnv_DS::Calling populateGPUEnvFromDevice() for selected device\n");
345  populateGPUEnvFromDevice( &gpuEnv, bestDevice );
346  gpuEnv.mnFileCount = 0; //argc;
347  gpuEnv.mnKernelCount = 0UL;
348 //PERF_COUNT_SUB("populate gpuEnv")
349  CompileKernelFile(&gpuEnv, "");
350 //PERF_COUNT_SUB("CompileKernelFile")
351  } else {
352  //printf("[DS] InitOpenclRunEnv_DS::Skipping populateGPUEnvFromDevice() b/c native cpu selected\n");
353  }
354  isInited = 1;
355  }
356 //PERF_COUNT_END
357  return 0;
358 }
359 
360 
361 OpenclDevice::OpenclDevice()
362 {
363  //InitEnv();
364 }
365 
366 OpenclDevice::~OpenclDevice()
367 {
368  //ReleaseOpenclRunEnv();
369 }
370 
371 int OpenclDevice::ReleaseOpenclEnv( GPUEnv *gpuInfo )
372 {
373  int i = 0;
374  int clStatus = 0;
375 
376  if ( !isInited )
377  {
378  return 1;
379  }
380 
381  for ( i = 0; i < gpuEnv.mnFileCount; i++ )
382  {
383  if ( gpuEnv.mpArryPrograms[i] )
384  {
385  clStatus = clReleaseProgram( gpuEnv.mpArryPrograms[i] );
386  CHECK_OPENCL( clStatus, "clReleaseProgram" );
387  gpuEnv.mpArryPrograms[i] = NULL;
388  }
389  }
390  if ( gpuEnv.mpCmdQueue )
391  {
392  clReleaseCommandQueue( gpuEnv.mpCmdQueue );
393  gpuEnv.mpCmdQueue = NULL;
394  }
395  if ( gpuEnv.mpContext )
396  {
397  clReleaseContext( gpuEnv.mpContext );
398  gpuEnv.mpContext = NULL;
399  }
400  isInited = 0;
401  gpuInfo->mnIsUserCreated = 0;
402  delete[] gpuInfo->mpArryDevsID;
403  return 1;
404 }
405 int OpenclDevice::BinaryGenerated( const char * clFileName, FILE ** fhandle )
406 {
407  unsigned int i = 0;
408  cl_int clStatus;
409  int status = 0;
410  char *str = NULL;
411  FILE *fd = NULL;
412  char fileName[256] = {0}, cl_name[128] = {0};
413  char deviceName[1024];
414  clStatus = clGetDeviceInfo(gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME,
415  sizeof(deviceName), deviceName, NULL);
416  CHECK_OPENCL(clStatus, "clGetDeviceInfo");
417  str = (char *)strstr(clFileName, (char *)".cl");
418  memcpy(cl_name, clFileName, str - clFileName);
419  cl_name[str - clFileName] = '\0';
420  sprintf(fileName, "%s-%s.bin", cl_name, deviceName);
421  legalizeFileName(fileName);
422  fd = fopen(fileName, "rb");
423  status = (fd != NULL) ? 1 : 0;
424  if (fd != NULL) {
425  *fhandle = fd;
426  }
427  return status;
428 
429 }
430 int OpenclDevice::CachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * clFileName )
431 {
432  int i;
433  for ( i = 0; i < gpuEnvCached->mnFileCount; i++ )
434  {
435  if ( strcasecmp( gpuEnvCached->mArryKnelSrcFile[i], clFileName ) == 0 )
436  {
437  if (gpuEnvCached->mpArryPrograms[i] != NULL) {
438  return 1;
439  }
440  }
441  }
442 
443  return 0;
444 }
445 int OpenclDevice::WriteBinaryToFile( const char* fileName, const char* birary, size_t numBytes )
446 {
447  FILE *output = NULL;
448  output = fopen(fileName, "wb");
449  if (output == NULL) {
450  return 0;
451  }
452 
453  fwrite( birary, sizeof(char), numBytes, output );
454  fclose( output );
455 
456  return 1;
457 
458 }
459 int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * clFileName )
460 {
461  unsigned int i = 0;
462  cl_int clStatus;
463  size_t *binarySizes;
464  cl_uint numDevices;
465  cl_device_id *mpArryDevsID;
466  char **binaries, *str = NULL;
467 
468  clStatus = clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES,
469  sizeof(numDevices), &numDevices, NULL);
470  CHECK_OPENCL( clStatus, "clGetProgramInfo" );
471 
472  mpArryDevsID = (cl_device_id*) malloc( sizeof(cl_device_id) * numDevices );
473  if (mpArryDevsID == NULL) {
474  return 0;
475  }
476  /* grab the handles to all of the devices in the program. */
477  clStatus = clGetProgramInfo(program, CL_PROGRAM_DEVICES,
478  sizeof(cl_device_id) * numDevices, mpArryDevsID,
479  NULL);
480  CHECK_OPENCL( clStatus, "clGetProgramInfo" );
481 
482  /* figure out the sizes of each of the binaries. */
483  binarySizes = (size_t*) malloc( sizeof(size_t) * numDevices );
484 
485  clStatus =
486  clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,
487  sizeof(size_t) * numDevices, binarySizes, NULL);
488  CHECK_OPENCL( clStatus, "clGetProgramInfo" );
489 
490  /* copy over all of the generated binaries. */
491  binaries = (char**) malloc( sizeof(char *) * numDevices );
492  if (binaries == NULL) {
493  return 0;
494  }
495 
496  for ( i = 0; i < numDevices; i++ )
497  {
498  if ( binarySizes[i] != 0 )
499  {
500  binaries[i] = (char*) malloc( sizeof(char) * binarySizes[i] );
501  if (binaries[i] == NULL) {
502  return 0;
503  }
504  }
505  else
506  {
507  binaries[i] = NULL;
508  }
509  }
510 
511  clStatus = clGetProgramInfo(program, CL_PROGRAM_BINARIES,
512  sizeof(char *) * numDevices, binaries, NULL);
513  CHECK_OPENCL(clStatus,"clGetProgramInfo");
514 
515  /* dump out each binary into its own separate file. */
516  for ( i = 0; i < numDevices; i++ )
517  {
518  char fileName[256] = { 0 }, cl_name[128] = { 0 };
519 
520  if ( binarySizes[i] != 0 )
521  {
522  char deviceName[1024];
523  clStatus = clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME,
524  sizeof(deviceName), deviceName, NULL);
525  CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
526 
527  str = (char*) strstr( clFileName, (char*) ".cl" );
528  memcpy( cl_name, clFileName, str - clFileName );
529  cl_name[str - clFileName] = '\0';
530  sprintf( fileName, "%s-%s.bin", cl_name, deviceName );
531  legalizeFileName(fileName);
532  if ( !WriteBinaryToFile( fileName, binaries[i], binarySizes[i] ) )
533  {
534  printf("[OD] write binary[%s] failed\n", fileName);
535  return 0;
536  } //else
537  printf("[OD] write binary[%s] successfully\n", fileName);
538  }
539  }
540 
541  // Release all resouces and memory
542  for ( i = 0; i < numDevices; i++ )
543  {
544  free(binaries[i]);
545  binaries[i] = NULL;
546  }
547 
548  free(binaries);
549  binaries = NULL;
550 
551  free(binarySizes);
552  binarySizes = NULL;
553 
554  free(mpArryDevsID);
555  mpArryDevsID = NULL;
556 
557  return 1;
558 }
559 
560 void copyIntBuffer( KernelEnv rEnv, cl_mem xValues, const l_uint32 *_pValues, size_t nElements, cl_int *pStatus )
561 {
562  l_int32 *pValues = (l_int32 *)clEnqueueMapBuffer(
563  rEnv.mpkCmdQueue, xValues, CL_TRUE, CL_MAP_WRITE, 0,
564  nElements * sizeof(l_int32), 0, NULL, NULL, NULL);
565  clFinish(rEnv.mpkCmdQueue);
566  if (_pValues != NULL) {
567  for (int i = 0; i < (int)nElements; i++) pValues[i] = (l_int32)_pValues[i];
568  }
569 
570  clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, xValues, pValues, 0, NULL,
571  NULL);
572  //clFinish( rEnv.mpkCmdQueue );
573  return;
574 }
575 
576 int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption )
577 {
578 //PERF_COUNT_START("CompileKernelFile")
579  cl_int clStatus = 0;
580  size_t length;
581  char *buildLog = NULL, *binary;
582  const char *source;
583  size_t source_size[1];
584  int b_error, binary_status, binaryExisted, idx;
585  cl_uint numDevices;
586  cl_device_id *mpArryDevsID;
587  FILE *fd, *fd1;
588  const char* filename = "kernel.cl";
589  //fprintf(stderr, "[OD] CompileKernelFile ... \n");
590  if ( CachedOfKernerPrg(gpuInfo, filename) == 1 )
591  {
592  return 1;
593  }
594 
595  idx = gpuInfo->mnFileCount;
596 
597  source = kernel_src;
598 
599  source_size[0] = strlen( source );
600  binaryExisted = 0;
601  binaryExisted = BinaryGenerated( filename, &fd ); // don't check for binary during microbenchmark
602 //PERF_COUNT_SUB("BinaryGenerated")
603  if ( binaryExisted == 1 )
604  {
605  clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES,
606  sizeof(numDevices), &numDevices, NULL);
607  CHECK_OPENCL(clStatus, "clGetContextInfo");
608 
609  mpArryDevsID = (cl_device_id *)malloc(sizeof(cl_device_id) * numDevices);
610  if (mpArryDevsID == NULL) {
611  return 0;
612  }
613 //PERF_COUNT_SUB("get numDevices")
614  b_error = 0;
615  length = 0;
616  b_error |= fseek( fd, 0, SEEK_END ) < 0;
617  b_error |= ( length = ftell(fd) ) <= 0;
618  b_error |= fseek( fd, 0, SEEK_SET ) < 0;
619  if ( b_error )
620  {
621  return 0;
622  }
623 
624  binary = (char*) malloc( length + 2 );
625  if ( !binary )
626  {
627  return 0;
628  }
629 
630  memset( binary, 0, length + 2 );
631  b_error |= fread( binary, 1, length, fd ) != length;
632 
633 
634  fclose( fd );
635 //PERF_COUNT_SUB("read file")
636  fd = NULL;
637  // grab the handles to all of the devices in the context.
638  clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_DEVICES,
639  sizeof(cl_device_id) * numDevices,
640  mpArryDevsID, NULL);
641  CHECK_OPENCL( clStatus, "clGetContextInfo" );
642 //PERF_COUNT_SUB("get devices")
643  //fprintf(stderr, "[OD] Create kernel from binary\n");
644  gpuInfo->mpArryPrograms[idx] = clCreateProgramWithBinary( gpuInfo->mpContext,numDevices,
645  mpArryDevsID, &length, (const unsigned char**) &binary,
646  &binary_status, &clStatus );
647  CHECK_OPENCL( clStatus, "clCreateProgramWithBinary" );
648 //PERF_COUNT_SUB("clCreateProgramWithBinary")
649  free( binary );
650  free( mpArryDevsID );
651  mpArryDevsID = NULL;
652  // PERF_COUNT_SUB("binaryExisted")
653  }
654  else
655  {
656  // create a CL program using the kernel source
657  //fprintf(stderr, "[OD] Create kernel from source\n");
658  gpuInfo->mpArryPrograms[idx] = clCreateProgramWithSource( gpuInfo->mpContext, 1, &source,
659  source_size, &clStatus);
660  CHECK_OPENCL( clStatus, "clCreateProgramWithSource" );
661 //PERF_COUNT_SUB("!binaryExisted")
662  }
663 
664  if (gpuInfo->mpArryPrograms[idx] == (cl_program) NULL) {
665  return 0;
666  }
667 
668  //char options[512];
669  // create a cl program executable for all the devices specified
670  //printf("[OD] BuildProgram.\n");
671 PERF_COUNT_START("OD::CompileKernel::clBuildProgram")
672  if (!gpuInfo->mnIsUserCreated)
673  {
674  clStatus =
675  clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID,
676  buildOption, NULL, NULL);
677  // PERF_COUNT_SUB("clBuildProgram notUserCreated")
678  }
679  else
680  {
681  clStatus =
682  clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID),
683  buildOption, NULL, NULL);
684  // PERF_COUNT_SUB("clBuildProgram isUserCreated")
685  }
687  if ( clStatus != CL_SUCCESS )
688  {
689  printf ("BuildProgram error!\n");
690  if ( !gpuInfo->mnIsUserCreated )
691  {
692  clStatus = clGetProgramBuildInfo(
693  gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
694  CL_PROGRAM_BUILD_LOG, 0, NULL, &length);
695  }
696  else
697  {
698  clStatus = clGetProgramBuildInfo(
699  gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
700  CL_PROGRAM_BUILD_LOG, 0, NULL, &length);
701  }
702  if ( clStatus != CL_SUCCESS )
703  {
704  printf("opencl create build log fail\n");
705  return 0;
706  }
707  buildLog = (char*) malloc( length );
708  if (buildLog == (char *)NULL) {
709  return 0;
710  }
711  if ( !gpuInfo->mnIsUserCreated )
712  {
713  clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
714  CL_PROGRAM_BUILD_LOG, length, buildLog, &length );
715  }
716  else
717  {
718  clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
719  CL_PROGRAM_BUILD_LOG, length, buildLog, &length );
720  }
721  if ( clStatus != CL_SUCCESS )
722  {
723  printf("opencl program build info fail\n");
724  return 0;
725  }
726 
727  fd1 = fopen( "kernel-build.log", "w+" );
728  if (fd1 != NULL) {
729  fwrite(buildLog, sizeof(char), length, fd1);
730  fclose(fd1);
731  }
732 
733  free( buildLog );
734 //PERF_COUNT_SUB("build error log")
735  return 0;
736  }
737 
738  strcpy( gpuInfo->mArryKnelSrcFile[idx], filename );
739 //PERF_COUNT_SUB("strcpy")
740  if ( binaryExisted == 0 ) {
741  GeneratBinFromKernelSource( gpuInfo->mpArryPrograms[idx], filename );
742  PERF_COUNT_SUB("GenerateBinFromKernelSource")
743  }
744 
745  gpuInfo->mnFileCount += 1;
746 //PERF_COUNT_END
747  return 1;
748 }
749 
750 l_uint32* OpenclDevice::pixReadFromTiffKernel(l_uint32 *tiffdata,l_int32 w,l_int32 h,l_int32 wpl,l_uint32 *line)
751 {
752 PERF_COUNT_START("pixReadFromTiffKernel")
753  cl_int clStatus;
754  KernelEnv rEnv;
755  size_t globalThreads[2];
756  size_t localThreads[2];
757  int gsize;
758  cl_mem valuesCl;
759  cl_mem outputCl;
760 
761  //global and local work dimensions for Horizontal pass
762  gsize = (w + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
763  globalThreads[0] = gsize;
764  gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
765  globalThreads[1] = gsize;
766  localThreads[0] = GROUPSIZE_X;
767  localThreads[1] = GROUPSIZE_Y;
768 
769  SetKernelEnv( &rEnv );
770 
771  l_uint32 *pResult = (l_uint32 *)malloc(w*h * sizeof(l_uint32));
772  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "composeRGBPixel", &clStatus );
773  CHECK_OPENCL(clStatus, "clCreateKernel composeRGBPixel");
774 
775  //Allocate input and output OCL buffers
776  valuesCl = allocateZeroCopyBuffer(rEnv, tiffdata, w*h, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
777  outputCl = allocateZeroCopyBuffer(rEnv, pResult, w*h, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
778 
779  //Kernel arguments
780  clStatus = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &valuesCl);
781  CHECK_OPENCL( clStatus, "clSetKernelArg");
782  clStatus = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(w), &w);
783  CHECK_OPENCL( clStatus, "clSetKernelArg" );
784  clStatus = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(h), &h);
785  CHECK_OPENCL( clStatus, "clSetKernelArg" );
786  clStatus = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
787  CHECK_OPENCL( clStatus, "clSetKernelArg" );
788  clStatus = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(cl_mem), &outputCl);
789  CHECK_OPENCL( clStatus, "clSetKernelArg");
790 
791  //Kernel enqueue
792 PERF_COUNT_SUB("before")
793 clStatus =
794  clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, NULL,
795  globalThreads, localThreads, 0, NULL, NULL);
796 CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel");
797 
798 /* map results back from gpu */
799 void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, outputCl, CL_TRUE, CL_MAP_READ,
800  0, w * h * sizeof(l_uint32), 0, NULL, NULL,
801  &clStatus);
802 CHECK_OPENCL(clStatus, "clEnqueueMapBuffer outputCl");
803 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, outputCl, ptr, 0, NULL, NULL);
804 
805 // Sync
806 clFinish(rEnv.mpkCmdQueue);
807 PERF_COUNT_SUB("kernel & map")
809  return pResult;
810 }
811 
812 PIX * OpenclDevice::pixReadTiffCl ( const char *filename, l_int32 n )
813 {
814 PERF_COUNT_START("pixReadTiffCL")
815  FILE *fp;
816 PIX *pix;
817 
818  //printf("pixReadTiffCl file");
819  PROCNAME("pixReadTiff");
820 
821  if (!filename)
822  return (PIX *)ERROR_PTR("filename not defined", procName, NULL);
823 
824  if ((fp = fopenReadStream(filename)) == NULL)
825  return (PIX *)ERROR_PTR("image file not found", procName, NULL);
826  if ((pix = pixReadStreamTiffCl(fp, n)) == NULL) {
827  fclose(fp);
828  return (PIX *)ERROR_PTR("pix not read", procName, NULL);
829  }
830  fclose(fp);
832  return pix;
833 }
834 TIFF *
835 OpenclDevice::fopenTiffCl(FILE *fp,
836  const char *modestring)
837 {
838 l_int32 fd;
839 
840  PROCNAME("fopenTiff");
841 
842  if (!fp) return (TIFF *)ERROR_PTR("stream not opened", procName, NULL);
843  if (!modestring)
844  return (TIFF *)ERROR_PTR("modestring not defined", procName, NULL);
845 
846  if ((fd = fileno(fp)) < 0)
847  return (TIFF *)ERROR_PTR("invalid file descriptor", procName, NULL);
848  lseek(fd, 0, SEEK_SET);
849 
850  return TIFFFdOpen(fd, "TIFFstream", modestring);
851 }
852 l_int32 OpenclDevice::getTiffStreamResolutionCl(TIFF *tif,
853  l_int32 *pxres,
854  l_int32 *pyres)
855 {
856 l_uint16 resunit;
857 l_int32 foundxres, foundyres;
858 l_float32 fxres, fyres;
859 
860  PROCNAME("getTiffStreamResolution");
861 
862  if (!tif)
863  return ERROR_INT("tif not opened", procName, 1);
864  if (!pxres || !pyres)
865  return ERROR_INT("&xres and &yres not both defined", procName, 1);
866  *pxres = *pyres = 0;
867 
868  TIFFGetFieldDefaulted(tif, TIFFTAG_RESOLUTIONUNIT, &resunit);
869  foundxres = TIFFGetField(tif, TIFFTAG_XRESOLUTION, &fxres);
870  foundyres = TIFFGetField(tif, TIFFTAG_YRESOLUTION, &fyres);
871  if (!foundxres && !foundyres) return 1;
872  if (!foundxres && foundyres)
873  fxres = fyres;
874  else if (foundxres && !foundyres)
875  fyres = fxres;
876 
877  if (resunit == RESUNIT_CENTIMETER) { /* convert to ppi */
878  *pxres = (l_int32)(2.54 * fxres + 0.5);
879  *pyres = (l_int32)(2.54 * fyres + 0.5);
880  }
881  else {
882  *pxres = (l_int32)fxres;
883  *pyres = (l_int32)fyres;
884  }
885 
886  return 0;
887 }
888 
889 struct L_Memstream
890 {
891 l_uint8 *buffer; /* expands to hold data when written to; */
892  /* fixed size when read from. */
893 size_t bufsize; /* current size allocated when written to; */
894  /* fixed size of input data when read from. */
895 size_t offset; /* byte offset from beginning of buffer. */
896 size_t hw; /* high-water mark; max bytes in buffer. */
897 l_uint8 **poutdata; /* input param for writing; data goes here. */
898 size_t *poutsize; /* input param for writing; data size goes here. */
899 };
900 typedef struct L_Memstream L_MEMSTREAM;
901 
902 /* These are static functions for memory I/O */
903 static L_MEMSTREAM *memstreamCreateForRead(l_uint8 *indata, size_t pinsize);
904 static L_MEMSTREAM *memstreamCreateForWrite(l_uint8 **poutdata,
905  size_t *poutsize);
906 static tsize_t tiffReadCallback(thandle_t handle, tdata_t data, tsize_t length);
907 static tsize_t tiffWriteCallback(thandle_t handle, tdata_t data,
908  tsize_t length);
909 static toff_t tiffSeekCallback(thandle_t handle, toff_t offset, l_int32 whence);
910 static l_int32 tiffCloseCallback(thandle_t handle);
911 static toff_t tiffSizeCallback(thandle_t handle);
912 static l_int32 tiffMapCallback(thandle_t handle, tdata_t *data, toff_t *length);
913 static void tiffUnmapCallback(thandle_t handle, tdata_t data, toff_t length);
914 
915 
916 static L_MEMSTREAM *
917 memstreamCreateForRead(l_uint8 *indata,
918 size_t insize)
919 {
920  L_MEMSTREAM *mstream;
921 
922  mstream = (L_MEMSTREAM *)CALLOC(1, sizeof(L_MEMSTREAM));
923  mstream->buffer = indata; /* handle to input data array */
924  mstream->bufsize = insize; /* amount of input data */
925  mstream->hw = insize; /* high-water mark fixed at input data size */
926  mstream->offset = 0; /* offset always starts at 0 */
927  return mstream;
928 }
929 
930 
931 static L_MEMSTREAM *
932 memstreamCreateForWrite(l_uint8 **poutdata,
933 size_t *poutsize)
934 {
935  L_MEMSTREAM *mstream;
936 
937  mstream = (L_MEMSTREAM *)CALLOC(1, sizeof(L_MEMSTREAM));
938  mstream->buffer = (l_uint8 *)CALLOC(8 * 1024, 1);
939  mstream->bufsize = 8 * 1024;
940  mstream->poutdata = poutdata; /* used only at end of write */
941  mstream->poutsize = poutsize; /* ditto */
942  mstream->hw = mstream->offset = 0;
943  return mstream;
944 }
945 
946 
947 static tsize_t
948 tiffReadCallback(thandle_t handle,
949 tdata_t data,
950 tsize_t length)
951 {
952  L_MEMSTREAM *mstream;
953  size_t amount;
954 
955  mstream = (L_MEMSTREAM *)handle;
956  amount = L_MIN((size_t)length, mstream->hw - mstream->offset);
957  memcpy(data, mstream->buffer + mstream->offset, amount);
958  mstream->offset += amount;
959  return amount;
960 }
961 
962 
963 static tsize_t
964 tiffWriteCallback(thandle_t handle,
965 tdata_t data,
966 tsize_t length)
967 {
968  L_MEMSTREAM *mstream;
969  size_t newsize;
970 
971  /* reallocNew() uses calloc to initialize the array.
972  * If malloc is used instead, for some of the encoding methods,
973  * not all the data in 'bufsize' bytes in the buffer will
974  * have been initialized by the end of the compression. */
975  mstream = (L_MEMSTREAM *)handle;
976  if (mstream->offset + length > mstream->bufsize) {
977  newsize = 2 * (mstream->offset + length);
978  mstream->buffer = (l_uint8 *)reallocNew((void **)&mstream->buffer,
979  mstream->offset, newsize);
980  mstream->bufsize = newsize;
981  }
982 
983  memcpy(mstream->buffer + mstream->offset, data, length);
984  mstream->offset += length;
985  mstream->hw = L_MAX(mstream->offset, mstream->hw);
986  return length;
987 }
988 
989 
990 static toff_t
991 tiffSeekCallback(thandle_t handle,
992 toff_t offset,
993 l_int32 whence)
994 {
995  L_MEMSTREAM *mstream;
996 
997  PROCNAME("tiffSeekCallback");
998  mstream = (L_MEMSTREAM *)handle;
999  switch (whence) {
1000  case SEEK_SET:
1001  /* fprintf(stderr, "seek_set: offset = %d\n", offset); */
1002  mstream->offset = offset;
1003  break;
1004  case SEEK_CUR:
1005  /* fprintf(stderr, "seek_cur: offset = %d\n", offset); */
1006  mstream->offset += offset;
1007  break;
1008  case SEEK_END:
1009  /* fprintf(stderr, "seek end: hw = %d, offset = %d\n",
1010  mstream->hw, offset); */
1011  mstream->offset = mstream->hw - offset; /* offset >= 0 */
1012  break;
1013  default:
1014  return (toff_t)ERROR_INT("bad whence value", procName,
1015  mstream->offset);
1016  }
1017 
1018  return mstream->offset;
1019 }
1020 
1021 
1022 static l_int32
1023 tiffCloseCallback(thandle_t handle)
1024 {
1025  L_MEMSTREAM *mstream;
1026 
1027  mstream = (L_MEMSTREAM *)handle;
1028  if (mstream->poutdata) { /* writing: save the output data */
1029  *mstream->poutdata = mstream->buffer;
1030  *mstream->poutsize = mstream->hw;
1031  }
1032  FREE(mstream); /* never free the buffer! */
1033  return 0;
1034 }
1035 
1036 
1037 static toff_t
1038 tiffSizeCallback(thandle_t handle)
1039 {
1040  L_MEMSTREAM *mstream;
1041 
1042  mstream = (L_MEMSTREAM *)handle;
1043  return mstream->hw;
1044 }
1045 
1046 
1047 static l_int32
1048 tiffMapCallback(thandle_t handle,
1049 tdata_t *data,
1050 toff_t *length)
1051 {
1052  L_MEMSTREAM *mstream;
1053 
1054  mstream = (L_MEMSTREAM *)handle;
1055  *data = mstream->buffer;
1056  *length = mstream->hw;
1057  return 0;
1058 }
1059 
1060 
1061 static void
1062 tiffUnmapCallback(thandle_t handle,
1063 tdata_t data,
1064 toff_t length)
1065 {
1066  return;
1067 }
1068 
1069 
1086 static TIFF *
1087 fopenTiffMemstream(const char *filename,
1088 const char *operation,
1089 l_uint8 **pdata,
1090 size_t *pdatasize)
1091 {
1092  L_MEMSTREAM *mstream;
1093 
1094  PROCNAME("fopenTiffMemstream");
1095 
1096  if (!filename)
1097  return (TIFF *)ERROR_PTR("filename not defined", procName, NULL);
1098  if (!operation)
1099  return (TIFF *)ERROR_PTR("operation not defined", procName, NULL);
1100  if (!pdata)
1101  return (TIFF *)ERROR_PTR("&data not defined", procName, NULL);
1102  if (!pdatasize)
1103  return (TIFF *)ERROR_PTR("&datasize not defined", procName, NULL);
1104  if (!strcmp(operation, "r") && !strcmp(operation, "w"))
1105  return (TIFF *)ERROR_PTR("operation not 'r' or 'w'}", procName,
1106  NULL);
1107 
1108  if (!strcmp(operation, "r"))
1109  mstream = memstreamCreateForRead(*pdata, *pdatasize);
1110  else
1111  mstream = memstreamCreateForWrite(pdata, pdatasize);
1112 
1113  return TIFFClientOpen(filename, operation, mstream, tiffReadCallback,
1114  tiffWriteCallback, tiffSeekCallback,
1115  tiffCloseCallback, tiffSizeCallback,
1116  tiffMapCallback, tiffUnmapCallback);
1117 }
1118 
1119 
1120 
1121 PIX *
1122 OpenclDevice::pixReadMemTiffCl(const l_uint8 *data,size_t size,l_int32 n)
1123 {
1124  l_int32 i, pagefound;
1125  PIX *pix;
1126  TIFF *tif;
1127  // L_MEMSTREAM *memStream;
1128  PROCNAME("pixReadMemTiffCl");
1129 
1130  if (!data)
1131  return (PIX *)ERROR_PTR("data pointer is NULL", procName, NULL);
1132 
1133  if ((tif = fopenTiffMemstream("", "r", (l_uint8 **)&data, &size)) ==
1134  NULL)
1135  return (PIX *)ERROR_PTR("tif not opened", procName, NULL);
1136 
1137  pagefound = FALSE;
1138  pix = NULL;
1139  for (i = 0; i < MAX_PAGES_IN_TIFF_FILE; i++) {
1140  if (i == n) {
1141  pagefound = TRUE;
1142  if ((pix = pixReadFromTiffStreamCl(tif)) == NULL) {
1143  TIFFCleanup(tif);
1144  return (PIX *)ERROR_PTR("pix not read", procName, NULL);
1145  }
1146  break;
1147  }
1148  if (TIFFReadDirectory(tif) == 0) break;
1149  }
1150 
1151  if (pagefound == FALSE) {
1152  L_WARNING("tiff page %d not found\n", procName, i);
1153  TIFFCleanup(tif);
1154  return NULL;
1155  }
1156 
1157  TIFFCleanup(tif);
1158  return pix;
1159 }
1160 
1161 PIX *
1162 OpenclDevice::pixReadStreamTiffCl(FILE *fp,
1163  l_int32 n)
1164 {
1165 l_int32 i, pagefound;
1166 PIX *pix;
1167 TIFF *tif;
1168 
1169  PROCNAME("pixReadStreamTiff");
1170 
1171  if (!fp) return (PIX *)ERROR_PTR("stream not defined", procName, NULL);
1172 
1173  if ((tif = fopenTiffCl(fp, "rb")) == NULL)
1174  return (PIX *)ERROR_PTR("tif not opened", procName, NULL);
1175 
1176  pagefound = FALSE;
1177  pix = NULL;
1178  for (i = 0; i < MAX_PAGES_IN_TIFF_FILE; i++) {
1179  if (i == n) {
1180  pagefound = TRUE;
1181  if ((pix = pixReadFromTiffStreamCl(tif)) == NULL) {
1182  TIFFCleanup(tif);
1183  return (PIX *)ERROR_PTR("pix not read", procName, NULL);
1184  }
1185  break;
1186  }
1187  if (TIFFReadDirectory(tif) == 0)
1188  break;
1189  }
1190 
1191  if (pagefound == FALSE) {
1192  L_WARNING("tiff page %d not found", procName, n);
1193  TIFFCleanup(tif);
1194  return NULL;
1195  }
1196 
1197  TIFFCleanup(tif);
1198  return pix;
1199 }
1200 
1201 static l_int32
1202 getTiffCompressedFormat(l_uint16 tiffcomp)
1203 {
1204 l_int32 comptype;
1205 
1206  switch (tiffcomp)
1207  {
1208  case COMPRESSION_CCITTFAX4:
1209  comptype = IFF_TIFF_G4;
1210  break;
1211  case COMPRESSION_CCITTFAX3:
1212  comptype = IFF_TIFF_G3;
1213  break;
1214  case COMPRESSION_CCITTRLE:
1215  comptype = IFF_TIFF_RLE;
1216  break;
1217  case COMPRESSION_PACKBITS:
1218  comptype = IFF_TIFF_PACKBITS;
1219  break;
1220  case COMPRESSION_LZW:
1221  comptype = IFF_TIFF_LZW;
1222  break;
1223  case COMPRESSION_ADOBE_DEFLATE:
1224  comptype = IFF_TIFF_ZIP;
1225  break;
1226  default:
1227  comptype = IFF_TIFF;
1228  break;
1229  }
1230  return comptype;
1231 }
1232 
1233 void compare(l_uint32 *cpu, l_uint32 *gpu,int size)
1234 {
1235  for(int i=0;i<size;i++)
1236  {
1237  if(cpu[i]!=gpu[i])
1238  {
1239  printf("\ndoesnot match\n");
1240  return;
1241  }
1242  }
1243  printf("\nit matches\n");
1244 }
1245 
1246 //OpenCL implementation of pixReadFromTiffStream.
1247 //Similar to the CPU implentation of pixReadFromTiffStream
1248 PIX *
1249 OpenclDevice::pixReadFromTiffStreamCl(TIFF *tif)
1250 {
1251 l_uint8 *linebuf, *data;
1252 l_uint16 spp, bps, bpp, tiffbpl, photometry, tiffcomp, orientation;
1253 l_uint16 *redmap, *greenmap, *bluemap;
1254 l_int32 d, wpl, bpl, comptype, i, ncolors;
1255 l_int32 xres, yres;
1256 l_uint32 w, h;
1257 l_uint32 *line, *tiffdata;
1258 PIX *pix;
1259 PIXCMAP *cmap;
1260 
1261  PROCNAME("pixReadFromTiffStream");
1262 
1263  if (!tif) return (PIX *)ERROR_PTR("tif not defined", procName, NULL);
1264 
1265  TIFFGetFieldDefaulted(tif, TIFFTAG_BITSPERSAMPLE, &bps);
1266  TIFFGetFieldDefaulted(tif, TIFFTAG_SAMPLESPERPIXEL, &spp);
1267  bpp = bps * spp;
1268  if (bpp > 32)
1269  return (PIX *)ERROR_PTR("can't handle bpp > 32", procName, NULL);
1270  if (spp == 1)
1271  d = bps;
1272  else if (spp == 3 || spp == 4)
1273  d = 32;
1274  else
1275  return (PIX *)ERROR_PTR("spp not in set {1,3,4}", procName, NULL);
1276 
1277  TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, &w);
1278  TIFFGetField(tif, TIFFTAG_IMAGELENGTH, &h);
1279  tiffbpl = TIFFScanlineSize(tif);
1280 
1281  if ((pix = pixCreate(w, h, d)) == NULL)
1282  return (PIX *)ERROR_PTR("pix not made", procName, NULL);
1283  data = (l_uint8 *)pixGetData(pix);
1284  wpl = pixGetWpl(pix);
1285  bpl = 4 * wpl;
1286 
1287  if (spp == 1) {
1288  if ((linebuf = (l_uint8 *)CALLOC(tiffbpl + 1, sizeof(l_uint8))) ==
1289  NULL)
1290  return (PIX *)ERROR_PTR("calloc fail for linebuf", procName, NULL);
1291 
1292  for (i = 0; i < h; i++) {
1293  if (TIFFReadScanline(tif, linebuf, i, 0) < 0) {
1294  FREE(linebuf);
1295  pixDestroy(&pix);
1296  return (PIX *)ERROR_PTR("line read fail", procName, NULL);
1297  }
1298  memcpy((char *)data, (char *)linebuf, tiffbpl);
1299  data += bpl;
1300  }
1301  if (bps <= 8)
1302  pixEndianByteSwap(pix);
1303  else
1304  pixEndianTwoByteSwap(pix);
1305  FREE(linebuf);
1306  } else {
1307  if ((tiffdata = (l_uint32 *)CALLOC(w * h, sizeof(l_uint32))) == NULL) {
1308  pixDestroy(&pix);
1309  return (PIX *)ERROR_PTR("calloc fail for tiffdata", procName, NULL);
1310  }
1311  if (!TIFFReadRGBAImageOriented(tif, w, h, (uint32 *)tiffdata,
1312  ORIENTATION_TOPLEFT, 0)) {
1313  FREE(tiffdata);
1314  pixDestroy(&pix);
1315  return (PIX *)ERROR_PTR("failed to read tiffdata", procName, NULL);
1316  }
1317  line = pixGetData(pix);
1318 
1319  // Invoke the OpenCL kernel for pixReadFromTiff
1320  l_uint32 *output_gpu = pixReadFromTiffKernel(tiffdata, w, h, wpl, line);
1321 
1322  pixSetData(pix, output_gpu);
1323  // pix already has data allocated, it now points to output_gpu?
1324  FREE(tiffdata);
1325  FREE(line);
1326  // FREE(output_gpu);
1327  }
1328 
1329  if (getTiffStreamResolutionCl(tif, &xres, &yres) == 0) {
1330  pixSetXRes(pix, xres);
1331  pixSetYRes(pix, yres);
1332  }
1333 
1334 
1335  TIFFGetFieldDefaulted(tif, TIFFTAG_COMPRESSION, &tiffcomp);
1336  comptype = getTiffCompressedFormat(tiffcomp);
1337  pixSetInputFormat(pix, comptype);
1338 
1339  if (TIFFGetField(tif, TIFFTAG_COLORMAP, &redmap, &greenmap, &bluemap)) {
1340  if ((cmap = pixcmapCreate(bps)) == NULL) {
1341  pixDestroy(&pix);
1342  return (PIX *)ERROR_PTR("cmap not made", procName, NULL);
1343  }
1344  ncolors = 1 << bps;
1345  for (i = 0; i < ncolors; i++)
1346  pixcmapAddColor(cmap, redmap[i] >> 8, greenmap[i] >> 8,
1347  bluemap[i] >> 8);
1348  pixSetColormap(pix, cmap);
1349  } else {
1350  if (!TIFFGetField(tif, TIFFTAG_PHOTOMETRIC, &photometry)) {
1351  if (tiffcomp == COMPRESSION_CCITTFAX3 ||
1352  tiffcomp == COMPRESSION_CCITTFAX4 ||
1353  tiffcomp == COMPRESSION_CCITTRLE ||
1354  tiffcomp == COMPRESSION_CCITTRLEW) {
1355  photometry = PHOTOMETRIC_MINISWHITE;
1356  } else
1357  photometry = PHOTOMETRIC_MINISBLACK;
1358  }
1359  if ((d == 1 && photometry == PHOTOMETRIC_MINISBLACK) ||
1360  (d == 8 && photometry == PHOTOMETRIC_MINISWHITE))
1361  pixInvert(pix, pix);
1362  }
1363 
1364  if (TIFFGetField(tif, TIFFTAG_ORIENTATION, &orientation)) {
1365  if (orientation >= 1 && orientation <= 8) {
1366  struct tiff_transform *transform =
1367  &tiff_orientation_transforms[orientation - 1];
1368  if (transform->vflip) pixFlipTB(pix, pix);
1369  if (transform->hflip) pixFlipLR(pix, pix);
1370  if (transform->rotate) {
1371  PIX *oldpix = pix;
1372  pix = pixRotate90(oldpix, transform->rotate);
1373  pixDestroy(&oldpix);
1374  }
1375  }
1376  }
1377 
1378  return pix;
1379 }
1380 
1381 //Morphology Dilate operation for 5x5 structuring element. Invokes the relevant OpenCL kernels
1382 cl_int
1383 pixDilateCL_55(l_int32 wpl, l_int32 h)
1384 {
1385  size_t globalThreads[2];
1386  cl_mem pixtemp;
1387  cl_int status;
1388  int gsize;
1389  size_t localThreads[2];
1390 
1391  //Horizontal pass
1392  gsize = (wpl*h + GROUPSIZE_HMORX - 1)/ GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1393  globalThreads[0] = gsize;
1394  globalThreads[1] = GROUPSIZE_HMORY;
1395  localThreads[0] = GROUPSIZE_HMORX;
1396  localThreads[1] = GROUPSIZE_HMORY;
1397 
1398  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateHor_5x5", &status );
1399  CHECK_OPENCL(status, "clCreateKernel morphoDilateHor_5x5");
1400 
1401  status = clSetKernelArg(rEnv.mpkKernel,
1402  0,
1403  sizeof(cl_mem),
1404  &pixsCLBuffer);
1405  status = clSetKernelArg(rEnv.mpkKernel,
1406  1,
1407  sizeof(cl_mem),
1408  &pixdCLBuffer);
1409  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1410  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1411 
1412  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1413  NULL, globalThreads, localThreads, 0,
1414  NULL, NULL);
1415 
1416  //Swap source and dest buffers
1417  pixtemp = pixsCLBuffer;
1418  pixsCLBuffer = pixdCLBuffer;
1419  pixdCLBuffer = pixtemp;
1420 
1421  //Vertical
1422  gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
1423  globalThreads[0] = gsize;
1424  gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
1425  globalThreads[1] = gsize;
1426  localThreads[0] = GROUPSIZE_X;
1427  localThreads[1] = GROUPSIZE_Y;
1428 
1429  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateVer_5x5", &status );
1430  CHECK_OPENCL(status, "clCreateKernel morphoDilateVer_5x5");
1431 
1432  status = clSetKernelArg(rEnv.mpkKernel,
1433  0,
1434  sizeof(cl_mem),
1435  &pixsCLBuffer);
1436  status = clSetKernelArg(rEnv.mpkKernel,
1437  1,
1438  sizeof(cl_mem),
1439  &pixdCLBuffer);
1440  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1441  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1442  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1443  NULL, globalThreads, localThreads, 0,
1444  NULL, NULL);
1445 
1446  return status;
1447 }
1448 
1449 //Morphology Erode operation for 5x5 structuring element. Invokes the relevant OpenCL kernels
1450 cl_int
1451 pixErodeCL_55(l_int32 wpl, l_int32 h)
1452 {
1453  size_t globalThreads[2];
1454  cl_mem pixtemp;
1455  cl_int status;
1456  int gsize;
1457  l_uint32 fwmask, lwmask;
1458  size_t localThreads[2];
1459 
1460  lwmask = lmask32[31 - 2];
1461  fwmask = rmask32[31 - 2];
1462 
1463  //Horizontal pass
1464  gsize = (wpl*h + GROUPSIZE_HMORX - 1)/ GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1465  globalThreads[0] = gsize;
1466  globalThreads[1] = GROUPSIZE_HMORY;
1467  localThreads[0] = GROUPSIZE_HMORX;
1468  localThreads[1] = GROUPSIZE_HMORY;
1469 
1470  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeHor_5x5", &status );
1471  CHECK_OPENCL(status, "clCreateKernel morphoErodeHor_5x5");
1472 
1473  status = clSetKernelArg(rEnv.mpkKernel,
1474  0,
1475  sizeof(cl_mem),
1476  &pixsCLBuffer);
1477  status = clSetKernelArg(rEnv.mpkKernel,
1478  1,
1479  sizeof(cl_mem),
1480  &pixdCLBuffer);
1481  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1482  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1483 
1484  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1485  NULL, globalThreads, localThreads, 0,
1486  NULL, NULL);
1487 
1488  //Swap source and dest buffers
1489  pixtemp = pixsCLBuffer;
1490  pixsCLBuffer = pixdCLBuffer;
1491  pixdCLBuffer = pixtemp;
1492 
1493  //Vertical
1494  gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
1495  globalThreads[0] = gsize;
1496  gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
1497  globalThreads[1] = gsize;
1498  localThreads[0] = GROUPSIZE_X;
1499  localThreads[1] = GROUPSIZE_Y;
1500 
1501  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeVer_5x5", &status );
1502  CHECK_OPENCL(status, "clCreateKernel morphoErodeVer_5x5");
1503 
1504  status = clSetKernelArg(rEnv.mpkKernel,
1505  0,
1506  sizeof(cl_mem),
1507  &pixsCLBuffer);
1508  status = clSetKernelArg(rEnv.mpkKernel,
1509  1,
1510  sizeof(cl_mem),
1511  &pixdCLBuffer);
1512  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1513  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1514  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(fwmask), &fwmask);
1515  status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(lwmask), &lwmask);
1516  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1517  NULL, globalThreads, localThreads, 0,
1518  NULL, NULL);
1519 
1520  return status;
1521 }
1522 
1523 //Morphology Dilate operation. Invokes the relevant OpenCL kernels
1524 cl_int
1525 pixDilateCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h)
1526 {
1527  l_int32 xp, yp, xn, yn;
1528  SEL* sel;
1529  size_t globalThreads[2];
1530  cl_mem pixtemp;
1531  cl_int status;
1532  int gsize;
1533  size_t localThreads[2];
1534  char isEven;
1535 
1536  OpenclDevice::SetKernelEnv( &rEnv );
1537 
1538  if (hsize == 5 && vsize == 5)
1539  {
1540  //Specific case for 5x5
1541  status = pixDilateCL_55(wpl, h);
1542  return status;
1543  }
1544 
1545  sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1546 
1547  selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1548  selDestroy(&sel);
1549  //global and local work dimensions for Horizontal pass
1550  gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
1551  globalThreads[0] = gsize;
1552  gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
1553  globalThreads[1] = gsize;
1554  localThreads[0] = GROUPSIZE_X;
1555  localThreads[1] = GROUPSIZE_Y;
1556 
1557  if (xp > 31 || xn > 31)
1558  {
1559  // Generic case.
1560  rEnv.mpkKernel =
1561  clCreateKernel(rEnv.mpkProgram, "morphoDilateHor", &status);
1562  CHECK_OPENCL(status, "clCreateKernel morphoDilateHor");
1563 
1564  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1565  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1566  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1567  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(xn), &xn);
1568  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(wpl), &wpl);
1569  status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(h), &h);
1570  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1571  NULL, globalThreads, localThreads, 0,
1572  NULL, NULL);
1573 
1574  if (yp > 0 || yn > 0) {
1575  pixtemp = pixsCLBuffer;
1576  pixsCLBuffer = pixdCLBuffer;
1577  pixdCLBuffer = pixtemp;
1578  }
1579  }
1580  else if (xp > 0 || xn > 0 )
1581  {
1582  // Specific Horizontal pass kernel for half width < 32
1583  rEnv.mpkKernel =
1584  clCreateKernel(rEnv.mpkProgram, "morphoDilateHor_32word", &status);
1585  CHECK_OPENCL(status, "clCreateKernel morphoDilateHor_32word");
1586  isEven = (xp != xn);
1587 
1588  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1589  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1590  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1591  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1592  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1593  status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isEven), &isEven);
1594  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1595  NULL, globalThreads, localThreads, 0,
1596  NULL, NULL);
1597 
1598  if (yp > 0 || yn > 0) {
1599  pixtemp = pixsCLBuffer;
1600  pixsCLBuffer = pixdCLBuffer;
1601  pixdCLBuffer = pixtemp;
1602  }
1603  }
1604 
1605  if (yp > 0 || yn > 0)
1606  {
1607  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateVer", &status );
1608  CHECK_OPENCL(status, "clCreateKernel morphoDilateVer");
1609 
1610  status = clSetKernelArg(rEnv.mpkKernel,
1611  0,
1612  sizeof(cl_mem),
1613  &pixsCLBuffer);
1614  status = clSetKernelArg(rEnv.mpkKernel,
1615  1,
1616  sizeof(cl_mem),
1617  &pixdCLBuffer);
1618  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(yp), &yp);
1619  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1620  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1621  status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(yn), &yn);
1622  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1623  NULL, globalThreads, localThreads, 0,
1624  NULL, NULL);
1625  }
1626 
1627  return status;
1628 }
1629 
1630 //Morphology Erode operation. Invokes the relevant OpenCL kernels
1631 cl_int pixErodeCL(l_int32 hsize, l_int32 vsize, l_uint32 wpl, l_uint32 h) {
1632  l_int32 xp, yp, xn, yn;
1633  SEL *sel;
1634  size_t globalThreads[2];
1635  size_t localThreads[2];
1636  cl_mem pixtemp;
1637  cl_int status;
1638  int gsize;
1639  char isAsymmetric = (MORPH_BC == ASYMMETRIC_MORPH_BC);
1640  l_uint32 rwmask, lwmask;
1641  char isEven;
1642 
1643  sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1644 
1645  selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1646  selDestroy(&sel);
1647  OpenclDevice::SetKernelEnv(&rEnv);
1648 
1649  if (hsize == 5 && vsize == 5 && isAsymmetric) {
1650  // Specific kernel for 5x5
1651  status = pixErodeCL_55(wpl, h);
1652  return status;
1653  }
1654 
1655  lwmask = lmask32[31 - (xn & 31)];
1656  rwmask = rmask32[31 - (xp & 31)];
1657 
1658  // global and local work dimensions for Horizontal pass
1659  gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1660  globalThreads[0] = gsize;
1661  gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1662  globalThreads[1] = gsize;
1663  localThreads[0] = GROUPSIZE_X;
1664  localThreads[1] = GROUPSIZE_Y;
1665 
1666  // Horizontal Pass
1667  if (xp > 31 || xn > 31) {
1668  // Generic case.
1669  rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoErodeHor", &status);
1670 
1671  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1672  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1673  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1674  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(xn), &xn);
1675  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(wpl), &wpl);
1676  status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(h), &h);
1677  status =
1678  clSetKernelArg(rEnv.mpkKernel, 6, sizeof(isAsymmetric), &isAsymmetric);
1679  status = clSetKernelArg(rEnv.mpkKernel, 7, sizeof(rwmask), &rwmask);
1680  status = clSetKernelArg(rEnv.mpkKernel, 8, sizeof(lwmask), &lwmask);
1681  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1682  NULL, globalThreads, localThreads, 0,
1683  NULL, NULL);
1684 
1685  if (yp > 0 || yn > 0) {
1686  pixtemp = pixsCLBuffer;
1687  pixsCLBuffer = pixdCLBuffer;
1688  pixdCLBuffer = pixtemp;
1689  }
1690  } else if (xp > 0 || xn > 0) {
1691  rEnv.mpkKernel =
1692  clCreateKernel(rEnv.mpkProgram, "morphoErodeHor_32word", &status);
1693  isEven = (xp != xn);
1694 
1695  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1696  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1697  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1698  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1699  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1700  status =
1701  clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isAsymmetric), &isAsymmetric);
1702  status = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(rwmask), &rwmask);
1703  status = clSetKernelArg(rEnv.mpkKernel, 7, sizeof(lwmask), &lwmask);
1704  status = clSetKernelArg(rEnv.mpkKernel, 8, sizeof(isEven), &isEven);
1705  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1706  NULL, globalThreads, localThreads, 0,
1707  NULL, NULL);
1708 
1709  if (yp > 0 || yn > 0) {
1710  pixtemp = pixsCLBuffer;
1711  pixsCLBuffer = pixdCLBuffer;
1712  pixdCLBuffer = pixtemp;
1713  }
1714  }
1715 
1716  // Vertical Pass
1717  if (yp > 0 || yn > 0) {
1718  rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoErodeVer", &status);
1719  CHECK_OPENCL(status, "clCreateKernel morphoErodeVer");
1720 
1721  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1722  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1723  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(yp), &yp);
1724  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1725  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1726  status =
1727  clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isAsymmetric), &isAsymmetric);
1728  status = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(yn), &yn);
1729  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1730  NULL, globalThreads, localThreads, 0,
1731  NULL, NULL);
1732  }
1733 
1734  return status;
1735 }
1736 
1737 // OpenCL implementation of Morphology Dilate
1738 //Note: Assumes the source and dest opencl buffer are initialized. No check done
1739 PIX *OpenclDevice::pixDilateBrickCL(PIX *pixd, PIX *pixs, l_int32 hsize,
1740  l_int32 vsize, bool reqDataCopy = false) {
1741  l_uint32 wpl, h;
1742 
1743  wpl = pixGetWpl(pixs);
1744  h = pixGetHeight(pixs);
1745 
1746  clStatus = pixDilateCL(hsize, vsize, wpl, h);
1747 
1748  if (reqDataCopy) {
1749  pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl * h,
1750  CL_MAP_READ, false);
1751  }
1752 
1753  return pixd;
1754 }
1755 
1756 // OpenCL implementation of Morphology Erode
1757 //Note: Assumes the source and dest opencl buffer are initialized. No check done
1758 PIX *OpenclDevice::pixErodeBrickCL(PIX *pixd, PIX *pixs, l_int32 hsize,
1759  l_int32 vsize, bool reqDataCopy = false) {
1760  l_uint32 wpl, h;
1761 
1762  wpl = pixGetWpl(pixs);
1763  h = pixGetHeight(pixs);
1764 
1765  clStatus = pixErodeCL(hsize, vsize, wpl, h);
1766 
1767  if (reqDataCopy) {
1768  pixd =
1769  mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl * h, CL_MAP_READ);
1770  }
1771 
1772  return pixd;
1773 }
1774 
1775 //Morphology Open operation. Invokes the relevant OpenCL kernels
1776 cl_int
1777 pixOpenCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h)
1778 {
1779  cl_int status;
1780  cl_mem pixtemp;
1781 
1782  //Erode followed by Dilate
1783  status = pixErodeCL(hsize, vsize, wpl, h);
1784 
1785  pixtemp = pixsCLBuffer;
1786  pixsCLBuffer = pixdCLBuffer;
1787  pixdCLBuffer = pixtemp;
1788 
1789  status = pixDilateCL(hsize, vsize, wpl, h);
1790 
1791  return status;
1792 }
1793 
1794 //Morphology Close operation. Invokes the relevant OpenCL kernels
1795 cl_int
1796 pixCloseCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h)
1797 {
1798  cl_int status;
1799  cl_mem pixtemp;
1800 
1801  //Dilate followed by Erode
1802  status = pixDilateCL(hsize, vsize, wpl, h);
1803 
1804  pixtemp = pixsCLBuffer;
1805  pixsCLBuffer = pixdCLBuffer;
1806  pixdCLBuffer = pixtemp;
1807 
1808  status = pixErodeCL(hsize, vsize, wpl, h);
1809 
1810  return status;
1811 }
1812 
1813 // OpenCL implementation of Morphology Close
1814 //Note: Assumes the source and dest opencl buffer are initialized. No check done
1815 PIX *OpenclDevice::pixCloseBrickCL(PIX *pixd, PIX *pixs, l_int32 hsize,
1816  l_int32 vsize, bool reqDataCopy = false) {
1817  l_uint32 wpl, h;
1818 
1819  wpl = pixGetWpl(pixs);
1820  h = pixGetHeight(pixs);
1821 
1822  clStatus = pixCloseCL(hsize, vsize, wpl, h);
1823 
1824  if (reqDataCopy) {
1825  pixd =
1826  mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl * h, CL_MAP_READ);
1827  }
1828 
1829  return pixd;
1830 }
1831 
1832 // OpenCL implementation of Morphology Open
1833 //Note: Assumes the source and dest opencl buffer are initialized. No check done
1834 PIX *OpenclDevice::pixOpenBrickCL(PIX *pixd, PIX *pixs, l_int32 hsize,
1835  l_int32 vsize, bool reqDataCopy = false) {
1836  l_uint32 wpl, h;
1837 
1838  wpl = pixGetWpl(pixs);
1839  h = pixGetHeight(pixs);
1840 
1841  clStatus = pixOpenCL(hsize, vsize, wpl, h);
1842 
1843  if (reqDataCopy) {
1844  pixd =
1845  mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl * h, CL_MAP_READ);
1846  }
1847 
1848  return pixd;
1849 }
1850 
1851 //pix OR operation: outbuffer = buffer1 | buffer2
1852 cl_int
1853 pixORCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2, cl_mem outbuffer)
1854 {
1855  cl_int status;
1856  size_t globalThreads[2];
1857  int gsize;
1858  size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
1859 
1860  gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
1861  globalThreads[0] = gsize;
1862  gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
1863  globalThreads[1] = gsize;
1864 
1865  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixOR", &status );
1866  CHECK_OPENCL(status, "clCreateKernel pixOR");
1867 
1868  status = clSetKernelArg(rEnv.mpkKernel,
1869  0,
1870  sizeof(cl_mem),
1871  &buffer1);
1872  status = clSetKernelArg(rEnv.mpkKernel,
1873  1,
1874  sizeof(cl_mem),
1875  &buffer2);
1876  status = clSetKernelArg(rEnv.mpkKernel,
1877  2,
1878  sizeof(cl_mem),
1879  &outbuffer);
1880  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1881  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1882  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1883  NULL, globalThreads, localThreads, 0,
1884  NULL, NULL);
1885 
1886  return status;
1887 }
1888 
1889 //pix AND operation: outbuffer = buffer1 & buffer2
1890 cl_int
1891 pixANDCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2, cl_mem outbuffer)
1892 {
1893  cl_int status;
1894  size_t globalThreads[2];
1895  int gsize;
1896  size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
1897 
1898  gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
1899  globalThreads[0] = gsize;
1900  gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
1901  globalThreads[1] = gsize;
1902 
1903  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixAND", &status );
1904  CHECK_OPENCL(status, "clCreateKernel pixAND");
1905 
1906  // Enqueue a kernel run call.
1907  status = clSetKernelArg(rEnv.mpkKernel,
1908  0,
1909  sizeof(cl_mem),
1910  &buffer1);
1911  status = clSetKernelArg(rEnv.mpkKernel,
1912  1,
1913  sizeof(cl_mem),
1914  &buffer2);
1915  status = clSetKernelArg(rEnv.mpkKernel,
1916  2,
1917  sizeof(cl_mem),
1918  &outbuffer);
1919  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1920  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1921  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1922  NULL, globalThreads, localThreads, 0,
1923  NULL, NULL);
1924 
1925  return status;
1926 }
1927 
1928 //output = buffer1 & ~(buffer2)
1929 cl_int pixSubtractCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1,
1930  cl_mem buffer2, cl_mem outBuffer = NULL) {
1931  cl_int status;
1932  size_t globalThreads[2];
1933  int gsize;
1934  size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
1935 
1936  gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1937  globalThreads[0] = gsize;
1938  gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1939  globalThreads[1] = gsize;
1940 
1941  if (outBuffer != NULL) {
1942  rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "pixSubtract", &status);
1943  CHECK_OPENCL(status, "clCreateKernel pixSubtract");
1944  } else {
1945  rEnv.mpkKernel =
1946  clCreateKernel(rEnv.mpkProgram, "pixSubtract_inplace", &status);
1947  CHECK_OPENCL(status, "clCreateKernel pixSubtract_inplace");
1948  }
1949 
1950  // Enqueue a kernel run call.
1951  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &buffer1);
1952  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &buffer2);
1953  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1954  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1955  if (outBuffer != NULL) {
1956  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(cl_mem), &outBuffer);
1957  }
1958  status =
1959  clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, NULL,
1960  globalThreads, localThreads, 0, NULL, NULL);
1961 
1962  return status;
1963 }
1964 
1965 // OpenCL implementation of Subtract pix
1966 //Note: Assumes the source and dest opencl buffer are initialized. No check done
1967 PIX *OpenclDevice::pixSubtractCL(PIX *pixd, PIX *pixs1, PIX *pixs2,
1968  bool reqDataCopy = false) {
1969  l_uint32 wpl, h;
1970 
1971  PROCNAME("pixSubtractCL");
1972 
1973  if (!pixs1) return (PIX *)ERROR_PTR("pixs1 not defined", procName, pixd);
1974  if (!pixs2) return (PIX *)ERROR_PTR("pixs2 not defined", procName, pixd);
1975  if (pixGetDepth(pixs1) != pixGetDepth(pixs2))
1976  return (PIX *)ERROR_PTR("depths of pixs* unequal", procName, pixd);
1977 
1978 #if EQUAL_SIZE_WARNING
1979  if (!pixSizesEqual(pixs1, pixs2))
1980  L_WARNING("pixs1 and pixs2 not equal sizes", procName);
1981 #endif /* EQUAL_SIZE_WARNING */
1982 
1983  wpl = pixGetWpl(pixs1);
1984  h = pixGetHeight(pixs1);
1985 
1986  clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
1987 
1988  if (reqDataCopy)
1989  {
1990  //Read back output data from OCL buffer to cpu
1991  pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs1, wpl*h, CL_MAP_READ);
1992  }
1993 
1994  return pixd;
1995 }
1996 
1997 // OpenCL implementation of Hollow pix
1998 //Note: Assumes the source and dest opencl buffer are initialized. No check done
1999 PIX *OpenclDevice::pixHollowCL(PIX *pixd, PIX *pixs, l_int32 close_hsize,
2000  l_int32 close_vsize, l_int32 open_hsize,
2001  l_int32 open_vsize, bool reqDataCopy = false) {
2002  l_uint32 wpl, h;
2003  cl_mem pixtemp;
2004 
2005  wpl = pixGetWpl(pixs);
2006  h = pixGetHeight(pixs);
2007 
2008  // First step : Close Morph operation: Dilate followed by Erode
2009  clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h);
2010 
2011  // Store the output of close operation in an intermediate buffer
2012  // this will be later used for pixsubtract
2013  clStatus =
2014  clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0,
2015  0, sizeof(int) * wpl * h, 0, NULL, NULL);
2016 
2017  // Second step: Open Operation - Erode followed by Dilate
2018  pixtemp = pixsCLBuffer;
2019  pixsCLBuffer = pixdCLBuffer;
2020  pixdCLBuffer = pixtemp;
2021 
2022  clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h);
2023 
2024  // Third step: Subtract : (Close - Open)
2025  pixtemp = pixsCLBuffer;
2026  pixsCLBuffer = pixdCLBuffer;
2027  pixdCLBuffer = pixdCLIntermediate;
2028  pixdCLIntermediate = pixtemp;
2029 
2030  clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
2031 
2032  if (reqDataCopy) {
2033  // Read back output data from OCL buffer to cpu
2034  pixd =
2035  mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl * h, CL_MAP_READ);
2036  }
2037  return pixd;
2038 }
2039 
2040 // OpenCL implementation of Get Lines from pix function
2041 //Note: Assumes the source and dest opencl buffer are initialized. No check done
2042 void OpenclDevice::pixGetLinesCL(PIX *pixd, PIX *pixs, PIX **pix_vline,
2043  PIX **pix_hline, PIX **pixClosed,
2044  bool getpixClosed, l_int32 close_hsize,
2045  l_int32 close_vsize, l_int32 open_hsize,
2046  l_int32 open_vsize, l_int32 line_hsize,
2047  l_int32 line_vsize) {
2048  l_uint32 wpl, h;
2049  cl_mem pixtemp;
2050 
2051  wpl = pixGetWpl(pixs);
2052  h = pixGetHeight(pixs);
2053 
2054  // First step : Close Morph operation: Dilate followed by Erode
2055  clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h);
2056 
2057  // Copy the Close output to CPU buffer
2058  if (getpixClosed) {
2059  *pixClosed = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pixClosed, pixs,
2060  wpl * h, CL_MAP_READ, true, false);
2061  }
2062 
2063  // Store the output of close operation in an intermediate buffer
2064  // this will be later used for pixsubtract
2065  clStatus =
2066  clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0,
2067  0, sizeof(int) * wpl * h, 0, NULL, NULL);
2068 
2069  // Second step: Open Operation - Erode followed by Dilate
2070  pixtemp = pixsCLBuffer;
2071  pixsCLBuffer = pixdCLBuffer;
2072  pixdCLBuffer = pixtemp;
2073 
2074  clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h);
2075 
2076  // Third step: Subtract : (Close - Open)
2077  pixtemp = pixsCLBuffer;
2078  pixsCLBuffer = pixdCLBuffer;
2079  pixdCLBuffer = pixdCLIntermediate;
2080  pixdCLIntermediate = pixtemp;
2081 
2082  clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
2083 
2084  // Store the output of Hollow operation in an intermediate buffer
2085  // this will be later used
2086  clStatus =
2087  clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0,
2088  0, sizeof(int) * wpl * h, 0, NULL, NULL);
2089 
2090  pixtemp = pixsCLBuffer;
2091  pixsCLBuffer = pixdCLBuffer;
2092  pixdCLBuffer = pixtemp;
2093 
2094  // Fourth step: Get vertical line
2095  // pixOpenBrick(NULL, pix_hollow, 1, min_line_length);
2096  clStatus = pixOpenCL(1, line_vsize, wpl, h);
2097 
2098  // Copy the vertical line output to CPU buffer
2099  *pix_vline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_vline, pixs, wpl * h,
2100  CL_MAP_READ, true, false);
2101 
2102  pixtemp = pixsCLBuffer;
2103  pixsCLBuffer = pixdCLIntermediate;
2104  pixdCLIntermediate = pixtemp;
2105 
2106  // Fifth step: Get horizontal line
2107  // pixOpenBrick(NULL, pix_hollow, min_line_length, 1);
2108  clStatus = pixOpenCL(line_hsize, 1, wpl, h);
2109 
2110  // Copy the horizontal line output to CPU buffer
2111  *pix_hline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_hline, pixs, wpl * h,
2112  CL_MAP_READ, true, true);
2113 
2114  return;
2115 }
2116 
2117 /*************************************************************************
2118  * HistogramRect
2119  * Otsu Thresholding Operations
2120  * histogramAllChannels is laid out as all channel 0, then all channel 1...
2121  * only supports 1 or 4 channels (bytes_per_pixel)
2122  ************************************************************************/
2123 int OpenclDevice::HistogramRectOCL(unsigned char *imageData,
2124  int bytes_per_pixel, int bytes_per_line,
2125  int left, // always 0
2126  int top, // always 0
2127  int width, int height, int kHistogramSize,
2128  int *histogramAllChannels) {
2129  PERF_COUNT_START("HistogramRectOCL")
2130  cl_int clStatus;
2131  int retVal = 0;
2132  KernelEnv histKern;
2133  SetKernelEnv(&histKern);
2134  KernelEnv histRedKern;
2135  SetKernelEnv(&histRedKern);
2136  /* map imagedata to device as read only */
2137  // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be
2138  // coherent which we don't need.
2139  // faster option would be to allocate initial image buffer
2140  // using a garlic bus memory type
2141  cl_mem imageBuffer = clCreateBuffer(
2142  histKern.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
2143  width * height * bytes_per_pixel * sizeof(char), imageData, &clStatus);
2144  CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
2145 
2146  /* setup work group size parameters */
2147  int block_size = 256;
2148  cl_uint numCUs;
2149  clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS,
2150  sizeof(numCUs), &numCUs, NULL);
2151  CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
2152 
2153  int requestedOccupancy = 10;
2154  int numWorkGroups = numCUs * requestedOccupancy;
2155  int numThreads = block_size * numWorkGroups;
2156  size_t local_work_size[] = {static_cast<size_t>(block_size)};
2157  size_t global_work_size[] = {static_cast<size_t>(numThreads)};
2158  size_t red_global_work_size[] = {
2159  static_cast<size_t>(block_size * kHistogramSize * bytes_per_pixel)};
2160 
2161  /* map histogramAllChannels as write only */
2162  int numBins = kHistogramSize * bytes_per_pixel * numWorkGroups;
2163 
2164  cl_mem histogramBuffer = clCreateBuffer(
2165  histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
2166  kHistogramSize * bytes_per_pixel * sizeof(int), histogramAllChannels,
2167  &clStatus);
2168  CHECK_OPENCL(clStatus, "clCreateBuffer histogramBuffer");
2169 
2170  /* intermediate histogram buffer */
2171  int histRed = 256;
2172  int tmpHistogramBins = kHistogramSize * bytes_per_pixel * histRed;
2173 
2174  cl_mem tmpHistogramBuffer =
2175  clCreateBuffer(histKern.mpkContext, CL_MEM_READ_WRITE,
2176  tmpHistogramBins * sizeof(cl_uint), NULL, &clStatus);
2177  CHECK_OPENCL(clStatus, "clCreateBuffer tmpHistogramBuffer");
2178 
2179  /* atomic sync buffer */
2180  int *zeroBuffer = new int[1];
2181  zeroBuffer[0] = 0;
2182  cl_mem atomicSyncBuffer = clCreateBuffer(
2183  histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
2184  sizeof(cl_int), zeroBuffer, &clStatus);
2185  CHECK_OPENCL(clStatus, "clCreateBuffer atomicSyncBuffer");
2186  delete[] zeroBuffer;
2187  // Create kernel objects based on bytes_per_pixel
2188  if (bytes_per_pixel == 1) {
2189  histKern.mpkKernel = clCreateKernel(
2190  histKern.mpkProgram, "kernel_HistogramRectOneChannel", &clStatus);
2191  CHECK_OPENCL(clStatus, "clCreateKernel kernel_HistogramRectOneChannel");
2192 
2193  histRedKern.mpkKernel =
2194  clCreateKernel(histRedKern.mpkProgram,
2195  "kernel_HistogramRectOneChannelReduction", &clStatus);
2196  CHECK_OPENCL(clStatus,
2197  "clCreateKernel kernel_HistogramRectOneChannelReduction");
2198  } else {
2199  histKern.mpkKernel = clCreateKernel( histKern.mpkProgram, "kernel_HistogramRectAllChannels", &clStatus );
2200  CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectAllChannels");
2201 
2202  histRedKern.mpkKernel = clCreateKernel( histRedKern.mpkProgram, "kernel_HistogramRectAllChannelsReduction", &clStatus );
2203  CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectAllChannelsReduction");
2204  }
2205 
2206  void *ptr;
2207 
2208  //Initialize tmpHistogramBuffer buffer
2209  ptr = clEnqueueMapBuffer(
2210  histKern.mpkCmdQueue, tmpHistogramBuffer, CL_TRUE, CL_MAP_WRITE, 0,
2211  tmpHistogramBins * sizeof(cl_uint), 0, NULL, NULL, &clStatus);
2212  CHECK_OPENCL( clStatus, "clEnqueueMapBuffer tmpHistogramBuffer");
2213 
2214  memset(ptr, 0, tmpHistogramBins*sizeof(cl_uint));
2215  clEnqueueUnmapMemObject(histKern.mpkCmdQueue, tmpHistogramBuffer, ptr, 0,
2216  NULL, NULL);
2217 
2218  /* set kernel 1 arguments */
2219  clStatus =
2220  clSetKernelArg(histKern.mpkKernel, 0, sizeof(cl_mem), &imageBuffer);
2221  CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer");
2222  cl_uint numPixels = width*height;
2223  clStatus =
2224  clSetKernelArg(histKern.mpkKernel, 1, sizeof(cl_uint), &numPixels);
2225  CHECK_OPENCL( clStatus, "clSetKernelArg numPixels" );
2226  clStatus = clSetKernelArg(histKern.mpkKernel, 2, sizeof(cl_mem),
2227  &tmpHistogramBuffer);
2228  CHECK_OPENCL( clStatus, "clSetKernelArg tmpHistogramBuffer");
2229 
2230  /* set kernel 2 arguments */
2231  int n = numThreads/bytes_per_pixel;
2232  clStatus = clSetKernelArg(histRedKern.mpkKernel, 0, sizeof(cl_int), &n);
2233  CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer");
2234  clStatus = clSetKernelArg(histRedKern.mpkKernel, 1, sizeof(cl_mem),
2235  &tmpHistogramBuffer);
2236  CHECK_OPENCL( clStatus, "clSetKernelArg tmpHistogramBuffer");
2237  clStatus = clSetKernelArg(histRedKern.mpkKernel, 2, sizeof(cl_mem),
2238  &histogramBuffer);
2239  CHECK_OPENCL( clStatus, "clSetKernelArg histogramBuffer");
2240 
2241  /* launch histogram */
2242 PERF_COUNT_SUB("before")
2243 clStatus = clEnqueueNDRangeKernel(histKern.mpkCmdQueue, histKern.mpkKernel, 1,
2244  NULL, global_work_size, local_work_size, 0,
2245  NULL, NULL);
2246 CHECK_OPENCL(clStatus,
2247  "clEnqueueNDRangeKernel kernel_HistogramRectAllChannels");
2248 clFinish(histKern.mpkCmdQueue);
2249 if (clStatus != 0) {
2250  retVal = -1;
2251  }
2252  /* launch histogram */
2253  clStatus = clEnqueueNDRangeKernel(
2254  histRedKern.mpkCmdQueue, histRedKern.mpkKernel, 1, NULL,
2255  red_global_work_size, local_work_size, 0, NULL, NULL);
2256  CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannelsReduction" );
2257  clFinish( histRedKern.mpkCmdQueue );
2258  if (clStatus != 0) {
2259  retVal = -1;
2260  }
2261  PERF_COUNT_SUB("redKernel")
2262 
2263  /* map results back from gpu */
2264  ptr = clEnqueueMapBuffer(histRedKern.mpkCmdQueue, histogramBuffer, CL_TRUE,
2265  CL_MAP_READ, 0,
2266  kHistogramSize * bytes_per_pixel * sizeof(int), 0,
2267  NULL, NULL, &clStatus);
2268  CHECK_OPENCL( clStatus, "clEnqueueMapBuffer histogramBuffer");
2269  if (clStatus != 0) {
2270  retVal = -1;
2271  }
2272  clEnqueueUnmapMemObject(histRedKern.mpkCmdQueue, histogramBuffer, ptr, 0,
2273  NULL, NULL);
2274 
2275  clReleaseMemObject(histogramBuffer);
2276  clReleaseMemObject(imageBuffer);
2277 PERF_COUNT_SUB("after")
2279 return retVal;
2280 }
2281 
2282 /*************************************************************************
2283  * Threshold the rectangle, taking everything except the image buffer pointer
2284  * from the class, using thresholds/hi_values to the output IMAGE.
2285  * only supports 1 or 4 channels
2286  ************************************************************************/
2287 int OpenclDevice::ThresholdRectToPixOCL(unsigned char *imageData,
2288  int bytes_per_pixel, int bytes_per_line,
2289  int *thresholds, int *hi_values,
2290  Pix **pix, int height, int width,
2291  int top, int left) {
2292  PERF_COUNT_START("ThresholdRectToPixOCL")
2293  int retVal = 0;
2294  /* create pix result buffer */
2295  *pix = pixCreate(width, height, 1);
2296  uinT32 *pixData = pixGetData(*pix);
2297  int wpl = pixGetWpl(*pix);
2298  int pixSize = wpl * height * sizeof(uinT32); // number of pixels
2299 
2300  cl_int clStatus;
2301  KernelEnv rEnv;
2302  SetKernelEnv(&rEnv);
2303 
2304  /* setup work group size parameters */
2305  int block_size = 256;
2306  cl_uint numCUs = 6;
2307  clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS,
2308  sizeof(numCUs), &numCUs, NULL);
2309  CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
2310 
2311  int requestedOccupancy = 10;
2312  int numWorkGroups = numCUs * requestedOccupancy;
2313  int numThreads = block_size * numWorkGroups;
2314  size_t local_work_size[] = {(size_t)block_size};
2315  size_t global_work_size[] = {(size_t)numThreads};
2316 
2317  /* map imagedata to device as read only */
2318  // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be
2319  // coherent which we don't need.
2320  // faster option would be to allocate initial image buffer
2321  // using a garlic bus memory type
2322  cl_mem imageBuffer = clCreateBuffer(
2323  rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
2324  width * height * bytes_per_pixel * sizeof(char), imageData, &clStatus);
2325  CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
2326 
2327  /* map pix as write only */
2328  pixThBuffer =
2329  clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
2330  pixSize, pixData, &clStatus);
2331  CHECK_OPENCL(clStatus, "clCreateBuffer pix");
2332 
2333  /* map thresholds and hi_values */
2334  cl_mem thresholdsBuffer =
2335  clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
2336  bytes_per_pixel * sizeof(int), thresholds, &clStatus);
2337  CHECK_OPENCL(clStatus, "clCreateBuffer thresholdBuffer");
2338  cl_mem hiValuesBuffer =
2339  clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
2340  bytes_per_pixel * sizeof(int), hi_values, &clStatus);
2341  CHECK_OPENCL(clStatus, "clCreateBuffer hiValuesBuffer");
2342 
2343  /* compile kernel */
2344  if (bytes_per_pixel == 4) {
2345  rEnv.mpkKernel =
2346  clCreateKernel(rEnv.mpkProgram, "kernel_ThresholdRectToPix", &clStatus);
2347  CHECK_OPENCL(clStatus, "clCreateKernel kernel_ThresholdRectToPix");
2348  } else {
2349  rEnv.mpkKernel = clCreateKernel(
2350  rEnv.mpkProgram, "kernel_ThresholdRectToPix_OneChan", &clStatus);
2351  CHECK_OPENCL(clStatus, "clCreateKernel kernel_ThresholdRectToPix_OneChan");
2352  }
2353 
2354  /* set kernel arguments */
2355  clStatus = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &imageBuffer);
2356  CHECK_OPENCL(clStatus, "clSetKernelArg imageBuffer");
2357  cl_uint numPixels = width * height;
2358  clStatus = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(int), &height);
2359  CHECK_OPENCL(clStatus, "clSetKernelArg height");
2360  clStatus = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(int), &width);
2361  CHECK_OPENCL(clStatus, "clSetKernelArg width");
2362  clStatus = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(int), &wpl);
2363  CHECK_OPENCL(clStatus, "clSetKernelArg wpl");
2364  clStatus =
2365  clSetKernelArg(rEnv.mpkKernel, 4, sizeof(cl_mem), &thresholdsBuffer);
2366  CHECK_OPENCL(clStatus, "clSetKernelArg thresholdsBuffer");
2367  clStatus = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(cl_mem), &hiValuesBuffer);
2368  CHECK_OPENCL(clStatus, "clSetKernelArg hiValuesBuffer");
2369  clStatus = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(cl_mem), &pixThBuffer);
2370  CHECK_OPENCL(clStatus, "clSetKernelArg pixThBuffer");
2371 
2372  /* launch kernel & wait */
2373  PERF_COUNT_SUB("before")
2374  clStatus = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 1,
2375  NULL, global_work_size, local_work_size,
2376  0, NULL, NULL);
2377  CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel kernel_ThresholdRectToPix");
2378  clFinish(rEnv.mpkCmdQueue);
2379  PERF_COUNT_SUB("kernel")
2380  if (clStatus != 0) {
2381  printf("Setting return value to -1\n");
2382  retVal = -1;
2383  }
2384  /* map results back from gpu */
2385  void *ptr =
2386  clEnqueueMapBuffer(rEnv.mpkCmdQueue, pixThBuffer, CL_TRUE, CL_MAP_READ, 0,
2387  pixSize, 0, NULL, NULL, &clStatus);
2388  CHECK_OPENCL(clStatus, "clEnqueueMapBuffer histogramBuffer");
2389  clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, pixThBuffer, ptr, 0, NULL,
2390  NULL);
2391 
2392  clReleaseMemObject(imageBuffer);
2393  clReleaseMemObject(thresholdsBuffer);
2394  clReleaseMemObject(hiValuesBuffer);
2395 
2396  PERF_COUNT_SUB("after")
2398  return retVal;
2399 }
2400 
2401 
2402 
2403 /******************************************************************************
2404  * Data Types for Device Selection
2405  *****************************************************************************/
2406 
2407 typedef struct _TessScoreEvaluationInputData {
2408  int height;
2409  int width;
2410  int numChannels;
2411  unsigned char *imageData;
2412  Pix *pix;
2413 } TessScoreEvaluationInputData;
2414 
2415 void populateTessScoreEvaluationInputData( TessScoreEvaluationInputData *input ) {
2416  srand(1);
2417  // 8.5x11 inches @ 300dpi rounded to clean multiples
2418  int height = 3328; // %256
2419  int width = 2560; // %512
2420  int numChannels = 4;
2421  input->height = height;
2422  input->width = width;
2423  input->numChannels = numChannels;
2424  unsigned char (*imageData4)[4] = (unsigned char (*)[4]) malloc(height*width*numChannels*sizeof(unsigned char)); // new unsigned char[4][height*width];
2425  input->imageData = (unsigned char *) &imageData4[0];
2426 
2427  // zero out image
2428  unsigned char pixelWhite[4] = { 0, 0, 0, 255};
2429  unsigned char pixelBlack[4] = {255, 255, 255, 255};
2430  for (int p = 0; p < height*width; p++) {
2431  //unsigned char tmp[4] = imageData4[0];
2432  imageData4[p][0] = pixelWhite[0];
2433  imageData4[p][1] = pixelWhite[1];
2434  imageData4[p][2] = pixelWhite[2];
2435  imageData4[p][3] = pixelWhite[3];
2436  }
2437  // random lines to be eliminated
2438  int maxLineWidth = 64; // pixels wide
2439  int numLines = 10;
2440  // vertical lines
2441  for (int i = 0; i < numLines; i++) {
2442  int lineWidth = rand()%maxLineWidth;
2443  int vertLinePos = lineWidth + rand()%(width-2*lineWidth);
2444  //printf("[PI] VerticalLine @ %i (w=%i)\n", vertLinePos, lineWidth);
2445  for (int row = vertLinePos-lineWidth/2; row < vertLinePos+lineWidth/2; row++) {
2446  for (int col = 0; col < height; col++) {
2447  //imageData4[row*width+col] = pixelBlack;
2448  imageData4[row*width+col][0] = pixelBlack[0];
2449  imageData4[row*width+col][1] = pixelBlack[1];
2450  imageData4[row*width+col][2] = pixelBlack[2];
2451  imageData4[row*width+col][3] = pixelBlack[3];
2452  }
2453  }
2454  }
2455  // horizontal lines
2456  for (int i = 0; i < numLines; i++) {
2457  int lineWidth = rand()%maxLineWidth;
2458  int horLinePos = lineWidth + rand()%(height-2*lineWidth);
2459  //printf("[PI] HorizontalLine @ %i (w=%i)\n", horLinePos, lineWidth);
2460  for (int row = 0; row < width; row++) {
2461  for (int col = horLinePos-lineWidth/2; col < horLinePos+lineWidth/2; col++) { // for (int row = vertLinePos-lineWidth/2; row < vertLinePos+lineWidth/2; row++) {
2462  //printf("[PI] HoizLine pix @ (%3i, %3i)\n", row, col);
2463  //imageData4[row*width+col] = pixelBlack;
2464  imageData4[row*width+col][0] = pixelBlack[0];
2465  imageData4[row*width+col][1] = pixelBlack[1];
2466  imageData4[row*width+col][2] = pixelBlack[2];
2467  imageData4[row*width+col][3] = pixelBlack[3];
2468  }
2469  }
2470  }
2471  // spots (noise, squares)
2472  float fractionBlack = 0.1; // how much of the image should be blackened
2473  int numSpots = (height*width)*fractionBlack/(maxLineWidth*maxLineWidth/2/2);
2474  for (int i = 0; i < numSpots; i++) {
2475  int lineWidth = rand()%maxLineWidth;
2476  int col = lineWidth + rand()%(width-2*lineWidth);
2477  int row = lineWidth + rand()%(height-2*lineWidth);
2478  //printf("[PI] Spot[%i/%i] @ (%3i, %3i)\n", i, numSpots, row, col );
2479  for (int r = row-lineWidth/2; r < row+lineWidth/2; r++) {
2480  for (int c = col-lineWidth/2; c < col+lineWidth/2; c++) {
2481  //printf("[PI] \tSpot[%i/%i] @ (%3i, %3i)\n", i, numSpots, r, c );
2482  //imageData4[row*width+col] = pixelBlack;
2483  imageData4[r*width+c][0] = pixelBlack[0];
2484  imageData4[r*width+c][1] = pixelBlack[1];
2485  imageData4[r*width+c][2] = pixelBlack[2];
2486  imageData4[r*width+c][3] = pixelBlack[3];
2487  }
2488  }
2489  }
2490 
2491  input->pix = pixCreate(input->width, input->height, 1);
2492 }
2493 
2494 typedef struct _TessDeviceScore {
2495  float time; // small time means faster device
2496  bool clError; // were there any opencl errors
2497  bool valid; // was the correct response generated
2498 } TessDeviceScore;
2499 
2500 /******************************************************************************
2501  * Micro Benchmarks for Device Selection
2502  *****************************************************************************/
2503 
2504 double composeRGBPixelMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) {
2505  double time = 0;
2506 #if ON_WINDOWS
2507  LARGE_INTEGER freq, time_funct_start, time_funct_end;
2508  QueryPerformanceFrequency(&freq);
2509 #elif ON_APPLE
2510  mach_timebase_info_data_t info = {0, 0};
2511  mach_timebase_info(&info);
2512  long long start, stop;
2513 #else
2514  timespec time_funct_start, time_funct_end;
2515 #endif
2516  // input data
2517  l_uint32 *tiffdata = (l_uint32 *)input.imageData;// same size and random data; data doesn't change workload
2518 
2519  // function call
2520  if (type == DS_DEVICE_OPENCL_DEVICE) {
2521 #if ON_WINDOWS
2522  QueryPerformanceCounter(&time_funct_start);
2523 #elif ON_APPLE
2524  start = mach_absolute_time();
2525 #else
2526  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2527 #endif
2528 
2529  OpenclDevice::gpuEnv = *env;
2530  int wpl = pixGetWpl(input.pix);
2531  OpenclDevice::pixReadFromTiffKernel(tiffdata, input.width, input.height,
2532  wpl, NULL);
2533 #if ON_WINDOWS
2534  QueryPerformanceCounter(&time_funct_end);
2535  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2536 #elif ON_APPLE
2537  stop = mach_absolute_time();
2538  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2539 #else
2540  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2541  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2542 #endif
2543 
2544  } else {
2545 #if ON_WINDOWS
2546  QueryPerformanceCounter(&time_funct_start);
2547 #elif ON_APPLE
2548  start = mach_absolute_time();
2549 #else
2550  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2551 #endif
2552  Pix *pix = pixCreate(input.width, input.height, 32);
2553  l_uint32 *pixData = pixGetData(pix);
2554  int wpl = pixGetWpl(pix);
2555  //l_uint32* output_gpu=pixReadFromTiffKernel(tiffdata,w,h,wpl,line);
2556  //pixSetData(pix, output_gpu);
2557  int i, j;
2558  int idx = 0;
2559  for (i = 0; i < input.height ; i++) {
2560  for (j = 0; j < input.width; j++) {
2561  l_uint32 tiffword = tiffdata[i * input.width + j];
2562  l_int32 rval = ((tiffword) & 0xff);
2563  l_int32 gval = (((tiffword) >> 8) & 0xff);
2564  l_int32 bval = (((tiffword) >> 16) & 0xff);
2565  l_uint32 value = (rval << 24) | (gval << 16) | (bval << 8);
2566  pixData[idx] = value;
2567  idx++;
2568  }
2569  }
2570 #if ON_WINDOWS
2571  QueryPerformanceCounter(&time_funct_end);
2572  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2573 #elif ON_APPLE
2574  stop = mach_absolute_time();
2575  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2576 #else
2577  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2578  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2579 #endif
2580  pixDestroy(&pix);
2581  }
2582 
2583 
2584  // cleanup
2585 
2586  return time;
2587 }
2588 
2589 double histogramRectMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) {
2590  double time;
2591 #if ON_WINDOWS
2592  LARGE_INTEGER freq, time_funct_start, time_funct_end;
2593  QueryPerformanceFrequency(&freq);
2594 #elif ON_APPLE
2595  mach_timebase_info_data_t info = {0, 0};
2596  mach_timebase_info(&info);
2597  long long start, stop;
2598 #else
2599  timespec time_funct_start, time_funct_end;
2600 #endif
2601 
2602  unsigned char pixelHi = (unsigned char)255;
2603 
2604  int left = 0;
2605  int top = 0;
2606  int kHistogramSize = 256;
2607  int bytes_per_line = input.width*input.numChannels;
2608  int *histogramAllChannels = new int[kHistogramSize*input.numChannels];
2609  int retVal = 0;
2610  // function call
2611  if (type == DS_DEVICE_OPENCL_DEVICE) {
2612 #if ON_WINDOWS
2613  QueryPerformanceCounter(&time_funct_start);
2614 #elif ON_APPLE
2615  start = mach_absolute_time();
2616 #else
2617  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2618 #endif
2619 
2620  OpenclDevice::gpuEnv = *env;
2621  int wpl = pixGetWpl(input.pix);
2622  retVal = OpenclDevice::HistogramRectOCL(
2623  input.imageData, input.numChannels, bytes_per_line, top, left,
2624  input.width, input.height, kHistogramSize, histogramAllChannels);
2625 
2626 #if ON_WINDOWS
2627  QueryPerformanceCounter(&time_funct_end);
2628  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2629 #elif ON_APPLE
2630  stop = mach_absolute_time();
2631  if (retVal == 0) {
2632  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2633  } else {
2634  time = FLT_MAX;
2635  }
2636 #else
2637  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2638  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2639 #endif
2640  } else {
2641  int *histogram = new int[kHistogramSize];
2642 #if ON_WINDOWS
2643  QueryPerformanceCounter(&time_funct_start);
2644 #elif ON_APPLE
2645  start = mach_absolute_time();
2646 #else
2647  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2648 #endif
2649  for (int ch = 0; ch < input.numChannels; ++ch) {
2650  tesseract::HistogramRect(input.pix, input.numChannels, left, top,
2651  input.width, input.height, histogram);
2652  }
2653 #if ON_WINDOWS
2654  QueryPerformanceCounter(&time_funct_end);
2655  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2656 #elif ON_APPLE
2657  stop = mach_absolute_time();
2658  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2659 #else
2660  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2661  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2662 #endif
2663  delete[] histogram;
2664  }
2665 
2666  // cleanup
2667  delete[] histogramAllChannels;
2668  return time;
2669 }
2670 
2671 //Reproducing the ThresholdRectToPix native version
2672 void ThresholdRectToPix_Native(const unsigned char* imagedata,
2673  int bytes_per_pixel,
2674  int bytes_per_line,
2675  const int* thresholds,
2676  const int* hi_values,
2677  Pix** pix) {
2678  int top = 0;
2679  int left = 0;
2680  int width = pixGetWidth(*pix);
2681  int height = pixGetHeight(*pix);
2682 
2683  *pix = pixCreate(width, height, 1);
2684  uinT32 *pixdata = pixGetData(*pix);
2685  int wpl = pixGetWpl(*pix);
2686  const unsigned char* srcdata = imagedata + top * bytes_per_line +
2687  left * bytes_per_pixel;
2688  for (int y = 0; y < height; ++y) {
2689  const uinT8 *linedata = srcdata;
2690  uinT32 *pixline = pixdata + y * wpl;
2691  for (int x = 0; x < width; ++x, linedata += bytes_per_pixel) {
2692  bool white_result = true;
2693  for (int ch = 0; ch < bytes_per_pixel; ++ch) {
2694  if (hi_values[ch] >= 0 &&
2695  (linedata[ch] > thresholds[ch]) == (hi_values[ch] == 0)) {
2696  white_result = false;
2697  break;
2698  }
2699  }
2700  if (white_result)
2701  CLEAR_DATA_BIT(pixline, x);
2702  else
2703  SET_DATA_BIT(pixline, x);
2704  }
2705  srcdata += bytes_per_line;
2706  }
2707 }
2708 
2709 double thresholdRectToPixMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) {
2710  double time;
2711  int retVal = 0;
2712 #if ON_WINDOWS
2713  LARGE_INTEGER freq, time_funct_start, time_funct_end;
2714  QueryPerformanceFrequency(&freq);
2715 #elif ON_APPLE
2716  mach_timebase_info_data_t info = {0, 0};
2717  mach_timebase_info(&info);
2718  long long start, stop;
2719 #else
2720  timespec time_funct_start, time_funct_end;
2721 #endif
2722 
2723  // input data
2724  unsigned char pixelHi = (unsigned char)255;
2725  int* thresholds = new int[4];
2726  thresholds[0] = pixelHi/2;
2727  thresholds[1] = pixelHi/2;
2728  thresholds[2] = pixelHi/2;
2729  thresholds[3] = pixelHi/2;
2730  int *hi_values = new int[4];
2731  thresholds[0] = pixelHi;
2732  thresholds[1] = pixelHi;
2733  thresholds[2] = pixelHi;
2734  thresholds[3] = pixelHi;
2735  //Pix* pix = pixCreate(width, height, 1);
2736  int top = 0;
2737  int left = 0;
2738  int bytes_per_line = input.width*input.numChannels;
2739 
2740  // function call
2741  if (type == DS_DEVICE_OPENCL_DEVICE) {
2742 #if ON_WINDOWS
2743  QueryPerformanceCounter(&time_funct_start);
2744 #elif ON_APPLE
2745  start = mach_absolute_time();
2746 #else
2747  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2748 #endif
2749 
2750  OpenclDevice::gpuEnv = *env;
2751  int wpl = pixGetWpl(input.pix);
2752  retVal = OpenclDevice::ThresholdRectToPixOCL(
2753  input.imageData, input.numChannels, bytes_per_line, thresholds,
2754  hi_values, &input.pix, input.height, input.width, top, left);
2755 
2756 #if ON_WINDOWS
2757  QueryPerformanceCounter(&time_funct_end);
2758  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2759 #elif ON_APPLE
2760  stop = mach_absolute_time();
2761  if (retVal == 0) {
2762  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2763  ;
2764  } else {
2765  time = FLT_MAX;
2766  }
2767 
2768 #else
2769  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2770  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2771 #endif
2772  } else {
2773 
2774 
2775  tesseract::ImageThresholder thresholder;
2776  thresholder.SetImage( input.pix );
2777 #if ON_WINDOWS
2778  QueryPerformanceCounter(&time_funct_start);
2779 #elif ON_APPLE
2780  start = mach_absolute_time();
2781 #else
2782  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2783 #endif
2784  ThresholdRectToPix_Native( input.imageData, input.numChannels, bytes_per_line,
2785  thresholds, hi_values, &input.pix );
2786 
2787 #if ON_WINDOWS
2788  QueryPerformanceCounter(&time_funct_end);
2789  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2790 #elif ON_APPLE
2791  stop = mach_absolute_time();
2792  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2793 #else
2794  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2795  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2796 #endif
2797  }
2798 
2799  // cleanup
2800  delete[] thresholds;
2801  delete[] hi_values;
2802  return time;
2803 }
2804 
2805 double getLineMasksMorphMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) {
2806 
2807  double time = 0;
2808 #if ON_WINDOWS
2809  LARGE_INTEGER freq, time_funct_start, time_funct_end;
2810  QueryPerformanceFrequency(&freq);
2811 #elif ON_APPLE
2812  mach_timebase_info_data_t info = {0, 0};
2813  mach_timebase_info(&info);
2814  long long start, stop;
2815 #else
2816  timespec time_funct_start, time_funct_end;
2817 #endif
2818 
2819  // input data
2820  int resolution = 300;
2821  int wpl = pixGetWpl(input.pix);
2822  int kThinLineFraction = 20; // tess constant
2823  int kMinLineLengthFraction = 4; // tess constant
2824  int max_line_width = resolution / kThinLineFraction;
2825  int min_line_length = resolution / kMinLineLengthFraction;
2826  int closing_brick = max_line_width / 3;
2827 
2828  // function call
2829  if (type == DS_DEVICE_OPENCL_DEVICE) {
2830 #if ON_WINDOWS
2831  QueryPerformanceCounter(&time_funct_start);
2832 #elif ON_APPLE
2833  start = mach_absolute_time();
2834 #else
2835  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2836 #endif
2837  Pix *src_pix = input.pix;
2838  OpenclDevice::gpuEnv = *env;
2839  OpenclDevice::initMorphCLAllocations(wpl, input.height, input.pix);
2840  Pix *pix_vline = NULL, *pix_hline = NULL, *pix_closed = NULL;
2841  OpenclDevice::pixGetLinesCL(
2842  NULL, input.pix, &pix_vline, &pix_hline, &pix_closed, true,
2843  closing_brick, closing_brick, max_line_width, max_line_width,
2844  min_line_length, min_line_length);
2845 
2846  OpenclDevice::releaseMorphCLBuffers();
2847 
2848 #if ON_WINDOWS
2849  QueryPerformanceCounter(&time_funct_end);
2850  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2851 #elif ON_APPLE
2852  stop = mach_absolute_time();
2853  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2854 #else
2855  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2856  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2857 #endif
2858  } else {
2859 #if ON_WINDOWS
2860  QueryPerformanceCounter(&time_funct_start);
2861 #elif ON_APPLE
2862  start = mach_absolute_time();
2863 #else
2864  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2865 #endif
2866 
2867  // native serial code
2868  Pix *src_pix = input.pix;
2869  Pix *pix_closed =
2870  pixCloseBrick(NULL, src_pix, closing_brick, closing_brick);
2871  Pix *pix_solid =
2872  pixOpenBrick(NULL, pix_closed, max_line_width, max_line_width);
2873  Pix *pix_hollow = pixSubtract(NULL, pix_closed, pix_solid);
2874  pixDestroy(&pix_solid);
2875  Pix *pix_vline = pixOpenBrick(NULL, pix_hollow, 1, min_line_length);
2876  Pix *pix_hline = pixOpenBrick(NULL, pix_hollow, min_line_length, 1);
2877  pixDestroy(&pix_hollow);
2878 
2879 #if ON_WINDOWS
2880  QueryPerformanceCounter(&time_funct_end);
2881  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2882 #elif ON_APPLE
2883  stop = mach_absolute_time();
2884  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2885 #else
2886  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2887  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2888 #endif
2889  }
2890 
2891  return time;
2892 }
2893 
2894 
2895 
2896 /******************************************************************************
2897  * Device Selection
2898  *****************************************************************************/
2899 
2900 #include "stdlib.h"
2901 
2902 // encode score object as byte string
2903 ds_status serializeScore( ds_device* device, void **serializedScore, unsigned int* serializedScoreSize ) {
2904  *serializedScoreSize = sizeof(TessDeviceScore);
2905  *serializedScore = new unsigned char[*serializedScoreSize];
2906  memcpy(*serializedScore, device->score, *serializedScoreSize);
2907  return DS_SUCCESS;
2908 }
2909 
2910 // parses byte string and stores in score object
2911 ds_status deserializeScore( ds_device* device, const unsigned char* serializedScore, unsigned int serializedScoreSize ) {
2912  // check that serializedScoreSize == sizeof(TessDeviceScore);
2913  device->score = new TessDeviceScore;
2914  memcpy(device->score, serializedScore, serializedScoreSize);
2915  return DS_SUCCESS;
2916 }
2917 
2918 ds_status releaseScore(void *score) {
2919  delete (TessDeviceScore *)score;
2920  return DS_SUCCESS;
2921 }
2922 
2923 // evaluate devices
2924 ds_status evaluateScoreForDevice( ds_device *device, void *inputData) {
2925  // overwrite statuc gpuEnv w/ current device
2926  // so native opencl calls can be used; they use static gpuEnv
2927  printf("\n[DS] Device: \"%s\" (%s) evaluation...\n", device->oclDeviceName, device->type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native" );
2928  GPUEnv *env = NULL;
2929  if (device->type == DS_DEVICE_OPENCL_DEVICE) {
2930  env = new GPUEnv;
2931  //printf("[DS] populating tmp GPUEnv from device\n");
2932  populateGPUEnvFromDevice( env, device->oclDeviceID);
2933  env->mnFileCount = 0; //argc;
2934  env->mnKernelCount = 0UL;
2935  //printf("[DS] compiling kernels for tmp GPUEnv\n");
2936  OpenclDevice::gpuEnv = *env;
2937  OpenclDevice::CompileKernelFile(env, "");
2938  }
2939 
2940  TessScoreEvaluationInputData *input = (TessScoreEvaluationInputData *)inputData;
2941 
2942  // pixReadTiff
2943  double composeRGBPixelTime = composeRGBPixelMicroBench( env, *input, device->type );
2944 
2945  // HistogramRect
2946  double histogramRectTime = histogramRectMicroBench( env, *input, device->type );
2947 
2948  // ThresholdRectToPix
2949  double thresholdRectToPixTime = thresholdRectToPixMicroBench( env, *input, device->type );
2950 
2951  // getLineMasks
2952  double getLineMasksMorphTime = getLineMasksMorphMicroBench( env, *input, device->type );
2953 
2954 
2955  // weigh times (% of cpu time)
2956  // these weights should be the % execution time that the native cpu code took
2957  float composeRGBPixelWeight = 1.2f;
2958  float histogramRectWeight = 2.4f;
2959  float thresholdRectToPixWeight = 4.5f;
2960  float getLineMasksMorphWeight = 5.0f;
2961 
2962  float weightedTime = composeRGBPixelWeight * composeRGBPixelTime +
2963  histogramRectWeight * histogramRectTime +
2964  thresholdRectToPixWeight * thresholdRectToPixTime +
2965  getLineMasksMorphWeight * getLineMasksMorphTime;
2966  device->score = new TessDeviceScore;
2967  ((TessDeviceScore *)device->score)->time = weightedTime;
2968 
2969  printf("[DS] Device: \"%s\" (%s) evaluated\n", device->oclDeviceName, device->type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native" );
2970  printf("[DS]%25s: %f (w=%.1f)\n", "composeRGBPixel", composeRGBPixelTime, composeRGBPixelWeight );
2971  printf("[DS]%25s: %f (w=%.1f)\n", "HistogramRect", histogramRectTime, histogramRectWeight );
2972  printf("[DS]%25s: %f (w=%.1f)\n", "ThresholdRectToPix", thresholdRectToPixTime, thresholdRectToPixWeight );
2973  printf("[DS]%25s: %f (w=%.1f)\n", "getLineMasksMorph", getLineMasksMorphTime, getLineMasksMorphWeight );
2974  printf("[DS]%25s: %f\n", "Score", ((TessDeviceScore *)device->score)->time );
2975  return DS_SUCCESS;
2976 }
2977 
2978 // initial call to select device
2979 ds_device OpenclDevice::getDeviceSelection( ) {
2980  if (!deviceIsSelected) {
2981  PERF_COUNT_START("getDeviceSelection")
2982  // check if opencl is available at runtime
2983  if (1 == LoadOpencl()) {
2984  // opencl is available
2985  // PERF_COUNT_SUB("LoadOpencl")
2986  // setup devices
2987  ds_status status;
2988  ds_profile *profile;
2989  status = initDSProfile(&profile, "v0.1");
2990  PERF_COUNT_SUB("initDSProfile")
2991  // try reading scores from file
2992  const char *fileName = "tesseract_opencl_profile_devices.dat";
2993  status = readProfileFromFile(profile, deserializeScore, fileName);
2994  if (status != DS_SUCCESS) {
2995  // need to run evaluation
2996  printf("[DS] Profile file not available (%s); performing profiling.\n",
2997  fileName);
2998 
2999  // create input data
3000  TessScoreEvaluationInputData input;
3001  populateTessScoreEvaluationInputData(&input);
3002  // PERF_COUNT_SUB("populateTessScoreEvaluationInputData")
3003  // perform evaluations
3004  unsigned int numUpdates;
3005  status = profileDevices(profile, DS_EVALUATE_ALL,
3006  evaluateScoreForDevice, &input, &numUpdates);
3007  PERF_COUNT_SUB("profileDevices")
3008  // write scores to file
3009  if (status == DS_SUCCESS) {
3010  status = writeProfileToFile(profile, serializeScore, fileName);
3011  PERF_COUNT_SUB("writeProfileToFile")
3012  if (status == DS_SUCCESS) {
3013  printf("[DS] Scores written to file (%s).\n", fileName);
3014  } else {
3015  printf(
3016  "[DS] Error saving scores to file (%s); scores not written to "
3017  "file.\n",
3018  fileName);
3019  }
3020  } else {
3021  printf(
3022  "[DS] Unable to evaluate performance; scores not written to "
3023  "file.\n");
3024  }
3025  } else {
3026  PERF_COUNT_SUB("readProfileFromFile")
3027  printf("[DS] Profile read from file (%s).\n", fileName);
3028  }
3029 
3030  // we now have device scores either from file or evaluation
3031  // select fastest using custom Tesseract selection algorithm
3032  float bestTime = FLT_MAX; // begin search with worst possible time
3033  int bestDeviceIdx = -1;
3034  for (int d = 0; d < profile->numDevices; d++) {
3035  ds_device device = profile->devices[d];
3036  TessDeviceScore score = *(TessDeviceScore *)device.score;
3037 
3038  float time = score.time;
3039  printf("[DS] Device[%i] %i:%s score is %f\n", d + 1, device.type,
3040  device.oclDeviceName, time);
3041  if (time < bestTime) {
3042  bestTime = time;
3043  bestDeviceIdx = d;
3044  }
3045  }
3046  printf("[DS] Selected Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1,
3047  profile->devices[bestDeviceIdx].oclDeviceName,
3048  profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE
3049  ? "OpenCL"
3050  : "Native");
3051  // cleanup
3052  // TODO: call destructor for profile object?
3053 
3054  bool overridden = false;
3055  char *overrideDeviceStr = getenv("TESSERACT_OPENCL_DEVICE");
3056  if (overrideDeviceStr != NULL) {
3057  int overrideDeviceIdx = atoi(overrideDeviceStr);
3058  if (overrideDeviceIdx > 0 && overrideDeviceIdx <= profile->numDevices) {
3059  printf(
3060  "[DS] Overriding Device Selection (TESSERACT_OPENCL_DEVICE=%s, "
3061  "%i)\n",
3062  overrideDeviceStr, overrideDeviceIdx);
3063  bestDeviceIdx = overrideDeviceIdx - 1;
3064  overridden = true;
3065  } else {
3066  printf(
3067  "[DS] Ignoring invalid TESSERACT_OPENCL_DEVICE=%s ([1,%i] are "
3068  "valid devices).\n",
3069  overrideDeviceStr, profile->numDevices);
3070  }
3071  }
3072 
3073  if (overridden) {
3074  printf("[DS] Overridden Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1,
3075  profile->devices[bestDeviceIdx].oclDeviceName,
3076  profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE
3077  ? "OpenCL"
3078  : "Native");
3079  }
3080  selectedDevice = profile->devices[bestDeviceIdx];
3081  // cleanup
3082  releaseDSProfile(profile, releaseScore);
3083  } else {
3084  // opencl isn't available at runtime, select native cpu device
3085  printf("[DS] OpenCL runtime not available.\n");
3086  selectedDevice.type = DS_DEVICE_NATIVE_CPU;
3087  selectedDevice.oclDeviceName = "(null)";
3088  selectedDevice.score = NULL;
3089  selectedDevice.oclDeviceID = NULL;
3090  selectedDevice.oclDriverVersion = NULL;
3091  }
3092  deviceIsSelected = true;
3093  PERF_COUNT_SUB("select from Profile")
3095  }
3096  // PERF_COUNT_END
3097  return selectedDevice;
3098 }
3099 
3100 
3101 bool OpenclDevice::selectedDeviceIsOpenCL() {
3102  ds_device device = getDeviceSelection();
3103  return (device.type == DS_DEVICE_OPENCL_DEVICE);
3104 }
3105 
3106 bool OpenclDevice::selectedDeviceIsNativeCPU() {
3107  ds_device device = getDeviceSelection();
3108  return (device.type == DS_DEVICE_NATIVE_CPU);
3109 }
3110 
3122 #define SET_DATA_BYTE(pdata, n, val) \
3123  (*(l_uint8 *)((l_uintptr_t)((l_uint8 *)(pdata) + (n)) ^ 3) = (val))
3124 
3125 Pix *OpenclDevice::pixConvertRGBToGrayOCL(Pix *srcPix, // 32-bit source
3126  float rwt, float gwt, float bwt) {
3127  PERF_COUNT_START("pixConvertRGBToGrayOCL")
3128  Pix *dstPix; // 8-bit destination
3129 
3130  if (rwt < 0.0 || gwt < 0.0 || bwt < 0.0) return NULL;
3131 
3132  if (rwt == 0.0 && gwt == 0.0 && bwt == 0.0) {
3133  // magic numbers from leptonica
3134  rwt = 0.3;
3135  gwt = 0.5;
3136  bwt = 0.2;
3137  }
3138  // normalize
3139  float sum = rwt + gwt + bwt;
3140  rwt /= sum;
3141  gwt /= sum;
3142  bwt /= sum;
3143 
3144  // source pix
3145  int w, h;
3146  pixGetDimensions(srcPix, &w, &h, NULL);
3147  // printf("Image is %i x %i\n", w, h);
3148  unsigned int *srcData = pixGetData(srcPix);
3149  int srcWPL = pixGetWpl(srcPix);
3150  int srcSize = srcWPL * h * sizeof(unsigned int);
3151 
3152  // destination pix
3153  if ((dstPix = pixCreate(w, h, 8)) == NULL) return NULL;
3154  pixCopyResolution(dstPix, srcPix);
3155  unsigned int *dstData = pixGetData(dstPix);
3156  int dstWPL = pixGetWpl(dstPix);
3157  int dstWords = dstWPL * h;
3158  int dstSize = dstWords * sizeof(unsigned int);
3159  // printf("dstSize = %i\n", dstSize);
3160  PERF_COUNT_SUB("pix setup")
3161 
3162  // opencl objects
3163  cl_int clStatus;
3164  KernelEnv kEnv;
3165  SetKernelEnv(&kEnv);
3166 
3167  // source buffer
3168  cl_mem srcBuffer =
3169  clCreateBuffer(kEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
3170  srcSize, srcData, &clStatus);
3171  CHECK_OPENCL(clStatus, "clCreateBuffer srcBuffer");
3172 
3173  // destination buffer
3174  cl_mem dstBuffer =
3175  clCreateBuffer(kEnv.mpkContext, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
3176  dstSize, dstData, &clStatus);
3177  CHECK_OPENCL(clStatus, "clCreateBuffer dstBuffer");
3178 
3179  // setup work group size parameters
3180  int block_size = 256;
3181  int numWorkGroups = ((h * w + block_size - 1) / block_size);
3182  int numThreads = block_size * numWorkGroups;
3183  size_t local_work_size[] = {static_cast<size_t>(block_size)};
3184  size_t global_work_size[] = {static_cast<size_t>(numThreads)};
3185  // printf("Enqueueing %i threads for %i output pixels\n", numThreads, w*h);
3186 
3187  /* compile kernel */
3188  kEnv.mpkKernel =
3189  clCreateKernel(kEnv.mpkProgram, "kernel_RGBToGray", &clStatus);
3190  CHECK_OPENCL(clStatus, "clCreateKernel kernel_RGBToGray");
3191 
3192  /* set kernel arguments */
3193  clStatus = clSetKernelArg(kEnv.mpkKernel, 0, sizeof(cl_mem), &srcBuffer);
3194  CHECK_OPENCL(clStatus, "clSetKernelArg srcBuffer");
3195  clStatus = clSetKernelArg(kEnv.mpkKernel, 1, sizeof(cl_mem), &dstBuffer);
3196  CHECK_OPENCL(clStatus, "clSetKernelArg dstBuffer");
3197  clStatus = clSetKernelArg(kEnv.mpkKernel, 2, sizeof(int), &srcWPL);
3198  CHECK_OPENCL(clStatus, "clSetKernelArg srcWPL");
3199  clStatus = clSetKernelArg(kEnv.mpkKernel, 3, sizeof(int), &dstWPL);
3200  CHECK_OPENCL(clStatus, "clSetKernelArg dstWPL");
3201  clStatus = clSetKernelArg(kEnv.mpkKernel, 4, sizeof(int), &h);
3202  CHECK_OPENCL(clStatus, "clSetKernelArg height");
3203  clStatus = clSetKernelArg(kEnv.mpkKernel, 5, sizeof(int), &w);
3204  CHECK_OPENCL(clStatus, "clSetKernelArg width");
3205  clStatus = clSetKernelArg(kEnv.mpkKernel, 6, sizeof(float), &rwt);
3206  CHECK_OPENCL(clStatus, "clSetKernelArg rwt");
3207  clStatus = clSetKernelArg(kEnv.mpkKernel, 7, sizeof(float), &gwt);
3208  CHECK_OPENCL(clStatus, "clSetKernelArg gwt");
3209  clStatus = clSetKernelArg(kEnv.mpkKernel, 8, sizeof(float), &bwt);
3210  CHECK_OPENCL(clStatus, "clSetKernelArg bwt");
3211 
3212  /* launch kernel & wait */
3213  PERF_COUNT_SUB("before")
3214  clStatus = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, kEnv.mpkKernel, 1,
3215  NULL, global_work_size, local_work_size,
3216  0, NULL, NULL);
3217  CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel kernel_RGBToGray");
3218  clFinish(kEnv.mpkCmdQueue);
3219  PERF_COUNT_SUB("kernel")
3220 
3221  /* map results back from gpu */
3222  void *ptr =
3223  clEnqueueMapBuffer(kEnv.mpkCmdQueue, dstBuffer, CL_TRUE, CL_MAP_READ, 0,
3224  dstSize, 0, NULL, NULL, &clStatus);
3225  CHECK_OPENCL(clStatus, "clEnqueueMapBuffer dstBuffer");
3226  clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, dstBuffer, ptr, 0, NULL,
3227  NULL);
3228 
3229 #if 0
3230  // validate: compute on cpu
3231  Pix *cpuPix = pixCreate(w, h, 8);
3232  pixCopyResolution(cpuPix, srcPix);
3233  unsigned int *cpuData = pixGetData(cpuPix);
3234  int cpuWPL = pixGetWpl(cpuPix);
3235  unsigned int *cpuLine, *srcLine;
3236  int i, j;
3237  for (i = 0, srcLine = srcData, cpuLine = cpuData; i < h; i++) {
3238  for (j = 0; j < w; j++) {
3239  unsigned int word = *(srcLine + j);
3240  int val = (l_int32)(rwt * ((word >> L_RED_SHIFT) & 0xff) +
3241  gwt * ((word >> L_GREEN_SHIFT) & 0xff) +
3242  bwt * ((word >> L_BLUE_SHIFT) & 0xff) + 0.5);
3243  SET_DATA_BYTE(cpuLine, j, val);
3244  }
3245  srcLine += srcWPL;
3246  cpuLine += cpuWPL;
3247  }
3248 
3249  // validate: compare
3250  printf("converted 32-bit -> 8-bit image\n");
3251  for (int row = 0; row < h; row++) {
3252  for (int col = 0; col < w; col++) {
3253  int idx = row*w + col;
3254  unsigned int srcVal = srcData[idx];
3255  unsigned char cpuVal = ((unsigned char *)cpuData)[idx];
3256  unsigned char oclVal = ((unsigned char *)dstData)[idx];
3257  if (srcVal > 0) {
3258  printf("%4i,%4i: %u, %u, %u\n", row, col, srcVal, cpuVal, oclVal);
3259  }
3260  }
3261  //printf("\n");
3262  }
3263 #endif
3264  // release opencl objects
3265  clReleaseMemObject(srcBuffer);
3266  clReleaseMemObject(dstBuffer);
3267 
3269  // success
3270  return dstPix;
3271 }
3272 #endif
#define TRUE
Definition: capi.h:45
const int kMinLineLengthFraction
Denominator of resolution makes min pixels to demand line lengths to be.
Definition: linefind.cpp:43
const int kHistogramSize
Definition: otsuthr.h:27
#define PERF_COUNT_END
unsigned int uinT32
Definition: host.h:103
unsigned char uinT8
Definition: host.h:99
#define FALSE
Definition: capi.h:46
#define PERF_COUNT_SUB(SUB)
void HistogramRect(Pix *src_pix, int channel, int left, int top, int width, int height, int *histogram)
Definition: otsuthr.cpp:151
const int kThinLineFraction
Denominator of resolution makes max pixel width to allow thin lines.
Definition: linefind.cpp:41
#define PERF_COUNT_START(FUNCT_NAME)
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)
Definition: thresholder.cpp:62
const char * kernel_src
Definition: oclkernels.h:19