tesseract  v4.0.0-17-g361f3264
Open Source OCR Engine
oclkernels.h
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use this file except in compliance with the License.
3 // You may obtain a copy of the License at
4 // http://www.apache.org/licenses/LICENSE-2.0
5 // Unless required by applicable law or agreed to in writing, software
6 // distributed under the License is distributed on an "AS IS" BASIS,
7 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8 // See the License for the specific language governing permissions and
9 // limitations under the License.
10 
11 #ifndef TESSERACT_OPENCL_OCLKERNELS_H_
12 #define TESSERACT_OPENCL_OCLKERNELS_H_
13 
14 #ifndef USE_EXTERNAL_KERNEL
15 #define KERNEL(...) #__VA_ARGS__ "\n"
16 // Double precision is a default of spreadsheets
17 // cl_khr_fp64: Khronos extension
18 // cl_amd_fp64: AMD extension
19 // use build option outside to define fp_t
21 const char *kernel_src = KERNEL(
22 \n#ifdef KHR_DP_EXTENSION\n
23 \n#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n
24 \n#elif AMD_DP_EXTENSION\n
25 \n#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n
26 \n#else\n
27 \n#endif\n
28 __kernel void composeRGBPixel(__global uint *tiffdata, int w, int h,int wpl, __global uint *output)
29 {
30  int i = get_global_id(1);
31  int j = get_global_id(0);
32  int tiffword,rval,gval,bval;
33 
34  //Ignore the excess
35  if ((i >= h) || (j >= w))
36  return;
37 
38  tiffword = tiffdata[i * w + j];
39  rval = ((tiffword) & 0xff);
40  gval = (((tiffword) >> 8) & 0xff);
41  bval = (((tiffword) >> 16) & 0xff);
42  output[i*wpl+j] = (rval << (8 * (sizeof(uint) - 1 - 0))) | (gval << (8 * (sizeof(uint) - 1 - 1))) | (bval << (8 * (sizeof(uint) - 1 - 2)));
43 }
44 )
45 
46 KERNEL(
47 \n__kernel void pixSubtract_inplace(__global int *dword, __global int *sword,
48  const int wpl, const int h)
49 {
50  const unsigned int row = get_global_id(1);
51  const unsigned int col = get_global_id(0);
52  const unsigned int pos = row * wpl + col;
53 
54  //Ignore the execss
55  if (row >= h || col >= wpl)
56  return;
57 
58  *(dword + pos) &= ~(*(sword + pos));
59 }\n
60 )
61 
62 KERNEL(
63 \n__kernel void pixSubtract(__global int *dword, __global int *sword,
64  const int wpl, const int h, __global int *outword)
65 {
66  const unsigned int row = get_global_id(1);
67  const unsigned int col = get_global_id(0);
68  const unsigned int pos = row * wpl + col;
69 
70  //Ignore the execss
71  if (row >= h || col >= wpl)
72  return;
73 
74  *(outword + pos) = *(dword + pos) & ~(*(sword + pos));
75 }\n
76 )
77 
78 KERNEL(
79 \n__kernel void morphoDilateHor_5x5(__global int *sword,__global int *dword,
80  const int wpl, const int h)
81 {
82  const unsigned int pos = get_global_id(0);
83  unsigned int prevword, nextword, currword,tempword;
84  unsigned int destword;
85  const int col = pos % wpl;
86 
87  //Ignore the execss
88  if (pos >= (wpl * h))
89  return;
90 
91 
92  currword = *(sword + pos);
93  destword = currword;
94 
95  //Handle boundary conditions
96  if(col==0)
97  prevword=0;
98  else
99  prevword = *(sword + pos - 1);
100 
101  if(col==(wpl - 1))
102  nextword=0;
103  else
104  nextword = *(sword + pos + 1);
105 
106  //Loop unrolled
107 
108  //1 bit to left and 1 bit to right
109  //Get the max value on LHS of every pixel
110  tempword = (prevword << (31)) | ((currword >> 1));
111  destword |= tempword;
112  //Get max value on RHS of every pixel
113  tempword = (currword << 1) | (nextword >> (31));
114  destword |= tempword;
115 
116  //2 bit to left and 2 bit to right
117  //Get the max value on LHS of every pixel
118  tempword = (prevword << (30)) | ((currword >> 2));
119  destword |= tempword;
120  //Get max value on RHS of every pixel
121  tempword = (currword << 2) | (nextword >> (30));
122  destword |= tempword;
123 
124 
125  *(dword + pos) = destword;
126 
127 }\n
128 )
129 
130 KERNEL(
131 \n__kernel void morphoDilateVer_5x5(__global int *sword,__global int *dword,
132  const int wpl, const int h)
133 {
134  const int col = get_global_id(0);
135  const int row = get_global_id(1);
136  const unsigned int pos = row * wpl + col;
137  unsigned int tempword;
138  unsigned int destword;
139  int i;
140 
141  //Ignore the execss
142  if (row >= h || col >= wpl)
143  return;
144 
145  destword = *(sword + pos);
146 
147  //2 words above
148  i = (row - 2) < 0 ? row : (row - 2);
149  tempword = *(sword + i*wpl + col);
150  destword |= tempword;
151 
152  //1 word above
153  i = (row - 1) < 0 ? row : (row - 1);
154  tempword = *(sword + i*wpl + col);
155  destword |= tempword;
156 
157  //1 word below
158  i = (row >= (h - 1)) ? row : (row + 1);
159  tempword = *(sword + i*wpl + col);
160  destword |= tempword;
161 
162  //2 words below
163  i = (row >= (h - 2)) ? row : (row + 2);
164  tempword = *(sword + i*wpl + col);
165  destword |= tempword;
166 
167  *(dword + pos) = destword;
168 }\n
169 )
170 
171 KERNEL(
172 \n__kernel void morphoDilateHor(__global int *sword,__global int *dword,const int xp, const int xn, const int wpl, const int h)
173 {
174  const int col = get_global_id(0);
175  const int row = get_global_id(1);
176  const unsigned int pos = row * wpl + col;
177  unsigned int parbitsxp, parbitsxn, nwords;
178  unsigned int destword, tempword, lastword, currword;
179  unsigned int lnextword, lprevword, rnextword, rprevword, firstword, secondword;
180  int i, j, siter, eiter;
181 
182  //Ignore the execss
183  if (pos >= (wpl*h) || (xn < 1 && xp < 1))
184  return;
185 
186  currword = *(sword + pos);
187  destword = currword;
188 
189  parbitsxp = xp & 31;
190  parbitsxn = xn & 31;
191  nwords = xp >> 5;
192 
193  if (parbitsxp > 0)
194  nwords += 1;
195  else
196  parbitsxp = 31;
197 
198  siter = (col - nwords);
199  eiter = (col + nwords);
200 
201  //Get prev word
202  if (col==0)
203  firstword = 0x0;
204  else
205  firstword = *(sword + pos - 1);
206 
207  //Get next word
208  if (col == (wpl - 1))
209  secondword = 0x0;
210  else
211  secondword = *(sword + pos + 1);
212 
213  //Last partial bits on either side
214  for (i = 1; i <= parbitsxp; i++)
215  {
216  //Get the max value on LHS of every pixel
217  tempword = ((i == parbitsxp) && (parbitsxp != parbitsxn)) ? 0x0 : (firstword << (32-i)) | ((currword >> i));
218 
219  destword |= tempword;
220 
221  //Get max value on RHS of every pixel
222  tempword = (currword << i) | (secondword >> (32 - i));
223  destword |= tempword;
224  }
225 
226  //Return if halfwidth <= 1 word
227  if (nwords == 1)
228  {
229  if (xn == 32)
230  {
231  destword |= firstword;
232  }
233  if (xp == 32)
234  {
235  destword |= secondword;
236  }
237 
238  *(dword + pos) = destword;
239  return;
240  }
241 
242  if (siter < 0)
243  firstword = 0x0;
244  else
245  firstword = *(sword + row*wpl + siter);
246 
247  if (eiter >= wpl)
248  lastword = 0x0;
249  else
250  lastword = *(sword + row*wpl + eiter);
251 
252  for (i = 1; i < nwords; i++)
253  {
254  //Gets LHS words
255  if ((siter + i) < 0)
256  secondword = 0x0;
257  else
258  secondword = *(sword + row*wpl + siter + i);
259 
260  lprevword = firstword << (32 - parbitsxn) | secondword >> parbitsxn;
261 
262  firstword = secondword;
263 
264  if ((siter + i + 1) < 0)
265  secondword = 0x0;
266  else
267  secondword = *(sword + row*wpl + siter + i + 1);
268 
269  lnextword = firstword << (32 - parbitsxn) | secondword >> parbitsxn;
270 
271  //Gets RHS words
272  if ((eiter - i) >= wpl)
273  firstword = 0x0;
274  else
275  firstword = *(sword + row*wpl + eiter - i);
276 
277  rnextword = firstword << parbitsxp | lastword >> (32 - parbitsxp);
278 
279  lastword = firstword;
280  if ((eiter - i - 1) >= wpl)
281  firstword = 0x0;
282  else
283  firstword = *(sword + row*wpl + eiter - i - 1);
284 
285  rprevword = firstword << parbitsxp | lastword >> (32 - parbitsxp);
286 
287  for (j = 1; j < 32; j++)
288  {
289  //OR LHS full words
290  tempword = (lprevword << j) | (lnextword >> (32 - j));
291  destword |= tempword;
292 
293  //OR RHS full words
294  tempword = (rprevword << j) | (rnextword >> (32 - j));
295  destword |= tempword;
296  }
297 
298  destword |= lprevword;
299  destword |= lnextword;
300  destword |= rprevword;
301  destword |= rnextword;
302 
303  lastword = firstword;
304  firstword = secondword;
305  }
306 
307  *(dword + pos) = destword;
308 }\n
309 )
310 
311 KERNEL(
312 \n__kernel void morphoDilateHor_32word(__global int *sword,__global int *dword,
313  const int halfwidth,
314  const int wpl, const int h,
315  const char isEven)
316 {
317  const int col = get_global_id(0);
318  const int row = get_global_id(1);
319  const unsigned int pos = row * wpl + col;
320  unsigned int prevword, nextword, currword,tempword;
321  unsigned int destword;
322  int i;
323 
324  //Ignore the execss
325  if (pos >= (wpl * h))
326  return;
327 
328  currword = *(sword + pos);
329  destword = currword;
330 
331  //Handle boundary conditions
332  if(col==0)
333  prevword=0;
334  else
335  prevword = *(sword + pos - 1);
336 
337  if(col==(wpl - 1))
338  nextword=0;
339  else
340  nextword = *(sword + pos + 1);
341 
342  for (i = 1; i <= halfwidth; i++)
343  {
344  //Get the max value on LHS of every pixel
345  if (i == halfwidth && isEven)
346  {
347  tempword = 0x0;
348  }
349  else
350  {
351  tempword = (prevword << (32-i)) | ((currword >> i));
352  }
353 
354  destword |= tempword;
355 
356  //Get max value on RHS of every pixel
357  tempword = (currword << i) | (nextword >> (32 - i));
358 
359  destword |= tempword;
360  }
361 
362  *(dword + pos) = destword;
363 }\n
364 )
365 
366 KERNEL(
367 \n__kernel void morphoDilateVer(__global int *sword,__global int *dword,
368  const int yp,
369  const int wpl, const int h,
370  const int yn)
371 {
372  const int col = get_global_id(0);
373  const int row = get_global_id(1);
374  const unsigned int pos = row * wpl + col;
375  unsigned int tempword;
376  unsigned int destword;
377  int i, siter, eiter;
378 
379  //Ignore the execss
380  if (row >= h || col >= wpl)
381  return;
382 
383  destword = *(sword + pos);
384 
385  //Set start position and end position considering the boundary conditions
386  siter = (row - yn) < 0 ? 0 : (row - yn);
387  eiter = (row >= (h - yp)) ? (h - 1) : (row + yp);
388 
389  for (i = siter; i <= eiter; i++)
390  {
391  tempword = *(sword + i*wpl + col);
392 
393  destword |= tempword;
394  }
395 
396  *(dword + pos) = destword;
397 }\n
398 )
399 
400 KERNEL(
401 \n__kernel void morphoErodeHor_5x5(__global int *sword,__global int *dword,
402  const int wpl, const int h)
403 {
404  const unsigned int pos = get_global_id(0);
405  unsigned int prevword, nextword, currword,tempword;
406  unsigned int destword;
407  const int col = pos % wpl;
408 
409  //Ignore the execss
410  if (pos >= (wpl * h))
411  return;
412 
413  currword = *(sword + pos);
414  destword = currword;
415 
416  //Handle boundary conditions
417  if(col==0)
418  prevword=0xffffffff;
419  else
420  prevword = *(sword + pos - 1);
421 
422  if(col==(wpl - 1))
423  nextword=0xffffffff;
424  else
425  nextword = *(sword + pos + 1);
426 
427  //Loop unrolled
428 
429  //1 bit to left and 1 bit to right
430  //Get the min value on LHS of every pixel
431  tempword = (prevword << (31)) | ((currword >> 1));
432  destword &= tempword;
433  //Get min value on RHS of every pixel
434  tempword = (currword << 1) | (nextword >> (31));
435  destword &= tempword;
436 
437  //2 bit to left and 2 bit to right
438  //Get the min value on LHS of every pixel
439  tempword = (prevword << (30)) | ((currword >> 2));
440  destword &= tempword;
441  //Get min value on RHS of every pixel
442  tempword = (currword << 2) | (nextword >> (30));
443  destword &= tempword;
444 
445 
446  *(dword + pos) = destword;
447 
448 }\n
449 )
450 
451 KERNEL(
452 \n__kernel void morphoErodeVer_5x5(__global int *sword,__global int *dword,
453  const int wpl, const int h,
454  const int fwmask, const int lwmask)
455 {
456  const int col = get_global_id(0);
457  const int row = get_global_id(1);
458  const unsigned int pos = row * wpl + col;
459  unsigned int tempword;
460  unsigned int destword;
461  int i;
462 
463  //Ignore the execss
464  if (row >= h || col >= wpl)
465  return;
466 
467  destword = *(sword + pos);
468 
469  if (row < 2 || row >= (h - 2))
470  {
471  destword = 0x0;
472  }
473  else
474  {
475  //2 words above
476  //i = (row - 2) < 0 ? row : (row - 2);
477  i = (row - 2);
478  tempword = *(sword + i*wpl + col);
479  destword &= tempword;
480 
481  //1 word above
482  //i = (row - 1) < 0 ? row : (row - 1);
483  i = (row - 1);
484  tempword = *(sword + i*wpl + col);
485  destword &= tempword;
486 
487  //1 word below
488  //i = (row >= (h - 1)) ? row : (row + 1);
489  i = (row + 1);
490  tempword = *(sword + i*wpl + col);
491  destword &= tempword;
492 
493  //2 words below
494  //i = (row >= (h - 2)) ? row : (row + 2);
495  i = (row + 2);
496  tempword = *(sword + i*wpl + col);
497  destword &= tempword;
498 
499  if (col == 0)
500  {
501  destword &= fwmask;
502  }
503  if (col == (wpl - 1))
504  {
505  destword &= lwmask;
506  }
507  }
508 
509 
510  *(dword + pos) = destword;
511 }\n
512 )
513 
514 KERNEL(
515 \n__kernel void morphoErodeHor(__global int *sword,__global int *dword, const int xp, const int xn, const int wpl,
516  const int h, const char isAsymmetric, const int rwmask, const int lwmask)
517 {
518  const int col = get_global_id(0);
519  const int row = get_global_id(1);
520  const unsigned int pos = row * wpl + col;
521  unsigned int parbitsxp, parbitsxn, nwords;
522  unsigned int destword, tempword, lastword, currword;
523  unsigned int lnextword, lprevword, rnextword, rprevword, firstword, secondword;
524  int i, j, siter, eiter;
525 
526  //Ignore the execss
527  if (pos >= (wpl*h) || (xn < 1 && xp < 1))
528  return;
529 
530  currword = *(sword + pos);
531  destword = currword;
532 
533  parbitsxp = xp & 31;
534  parbitsxn = xn & 31;
535  nwords = xp >> 5;
536 
537  if (parbitsxp > 0)
538  nwords += 1;
539  else
540  parbitsxp = 31;
541 
542  siter = (col - nwords);
543  eiter = (col + nwords);
544 
545  //Get prev word
546  if (col==0)
547  firstword = 0xffffffff;
548  else
549  firstword = *(sword + pos - 1);
550 
551  //Get next word
552  if (col == (wpl - 1))
553  secondword = 0xffffffff;
554  else
555  secondword = *(sword + pos + 1);
556 
557  //Last partial bits on either side
558  for (i = 1; i <= parbitsxp; i++)
559  {
560  //Get the max value on LHS of every pixel
561  tempword = (firstword << (32-i)) | ((currword >> i));
562  destword &= tempword;
563 
564  //Get max value on RHS of every pixel
565  tempword = ((i == parbitsxp) && (parbitsxp != parbitsxn)) ? 0xffffffff : (currword << i) | (secondword >> (32 - i));
566 
567  //tempword = (currword << i) | (secondword >> (32 - i));
568  destword &= tempword;
569  }
570 
571  //Return if halfwidth <= 1 word
572  if (nwords == 1)
573  {
574  if (xp == 32)
575  {
576  destword &= firstword;
577  }
578  if (xn == 32)
579  {
580  destword &= secondword;
581  }
582 
583  //Clear boundary pixels
584  if (isAsymmetric)
585  {
586  if (col == 0)
587  destword &= rwmask;
588  if (col == (wpl - 1))
589  destword &= lwmask;
590  }
591 
592  *(dword + pos) = destword;
593  return;
594  }
595 
596  if (siter < 0)
597  firstword = 0xffffffff;
598  else
599  firstword = *(sword + row*wpl + siter);
600 
601  if (eiter >= wpl)
602  lastword = 0xffffffff;
603  else
604  lastword = *(sword + row*wpl + eiter);
605 
606 
607  for (i = 1; i < nwords; i++)
608  {
609  //Gets LHS words
610  if ((siter + i) < 0)
611  secondword = 0xffffffff;
612  else
613  secondword = *(sword + row*wpl + siter + i);
614 
615  lprevword = firstword << (32 - parbitsxp) | secondword >> (parbitsxp);
616 
617  firstword = secondword;
618 
619  if ((siter + i + 1) < 0)
620  secondword = 0xffffffff;
621  else
622  secondword = *(sword + row*wpl + siter + i + 1);
623 
624  lnextword = firstword << (32 - parbitsxp) | secondword >> (parbitsxp);
625 
626  //Gets RHS words
627  if ((eiter - i) >= wpl)
628  firstword = 0xffffffff;
629  else
630  firstword = *(sword + row*wpl + eiter - i);
631 
632  rnextword = firstword << parbitsxn | lastword >> (32 - parbitsxn);
633 
634  lastword = firstword;
635  if ((eiter - i - 1) >= wpl)
636  firstword = 0xffffffff;
637  else
638  firstword = *(sword + row*wpl + eiter - i - 1);
639 
640  rprevword = firstword << parbitsxn | lastword >> (32 - parbitsxn);
641 
642  for (j = 0; j < 32; j++)
643  {
644  //OR LHS full words
645  tempword = (lprevword << j) | (lnextword >> (32 - j));
646  destword &= tempword;
647 
648  //OR RHS full words
649  tempword = (rprevword << j) | (rnextword >> (32 - j));
650  destword &= tempword;
651  }
652 
653  destword &= lprevword;
654  destword &= lnextword;
655  destword &= rprevword;
656  destword &= rnextword;
657 
658  lastword = firstword;
659  firstword = secondword;
660  }
661 
662  if (isAsymmetric)
663  {
664  //Clear boundary pixels
665  if (col < (nwords - 1))
666  destword = 0x0;
667  else if (col == (nwords - 1))
668  destword &= rwmask;
669  else if (col > (wpl - nwords))
670  destword = 0x0;
671  else if (col == (wpl - nwords))
672  destword &= lwmask;
673  }
674 
675  *(dword + pos) = destword;
676 }\n
677 )
678 
679 KERNEL(
680 \n__kernel void morphoErodeHor_32word(__global int *sword,__global int *dword,
681  const int halfwidth, const int wpl,
682  const int h, const char clearBoundPixH,
683  const int rwmask, const int lwmask,
684  const char isEven)
685 {
686  const int col = get_global_id(0);
687  const int row = get_global_id(1);
688  const unsigned int pos = row * wpl + col;
689  unsigned int prevword, nextword, currword,tempword, destword;
690  int i;
691 
692  //Ignore the execss
693  if (pos >= (wpl * h))
694  return;
695 
696  currword = *(sword + pos);
697  destword = currword;
698 
699  //Handle boundary conditions
700  if(col==0)
701  prevword=0xffffffff;
702  else
703  prevword = *(sword + pos - 1);
704 
705  if(col==(wpl - 1))
706  nextword=0xffffffff;
707  else
708  nextword = *(sword + pos + 1);
709 
710  for (i = 1; i <= halfwidth; i++)
711  {
712  //Get the min value on LHS of every pixel
713  tempword = (prevword << (32-i)) | ((currword >> i));
714 
715  destword &= tempword;
716 
717  //Get min value on RHS of every pixel
718  if (i == halfwidth && isEven)
719  {
720  tempword = 0xffffffff;
721  }
722  else
723  {
724  tempword = (currword << i) | (nextword >> (32 - i));
725  }
726 
727  destword &= tempword;
728  }
729 
730  if (clearBoundPixH)
731  {
732  if (col == 0)
733  {
734  destword &= rwmask;
735  }
736  else if (col == (wpl - 1))
737  {
738  destword &= lwmask;
739  }
740  }
741 
742  *(dword + pos) = destword;
743 }\n
744 )
745 
746 KERNEL(
747 \n__kernel void morphoErodeVer(__global int *sword,__global int *dword,
748  const int yp,
749  const int wpl, const int h,
750  const char clearBoundPixV, const int yn)
751 {
752  const int col = get_global_id(0);
753  const int row = get_global_id(1);
754  const unsigned int pos = row * wpl + col;
755  unsigned int tempword, destword;
756  int i, siter, eiter;
757 
758  //Ignore the execss
759  if (row >= h || col >= wpl)
760  return;
761 
762  destword = *(sword + pos);
763 
764  //Set start position and end position considering the boundary conditions
765  siter = (row - yp) < 0 ? 0 : (row - yp);
766  eiter = (row >= (h - yn)) ? (h - 1) : (row + yn);
767 
768  for (i = siter; i <= eiter; i++)
769  {
770  tempword = *(sword + i*wpl + col);
771 
772  destword &= tempword;
773  }
774 
775  //Clear boundary pixels
776  if (clearBoundPixV && ((row < yp) || ((h - row) <= yn)))
777  {
778  destword = 0x0;
779  }
780 
781  *(dword + pos) = destword;
782 }\n
783 )
784 
785 // HistogramRect Kernel: Accumulate
786 // assumes 4 channels, i.e., bytes_per_pixel = 4
787 // assumes number of pixels is multiple of 8
788 // data is laid out as
789 // ch0 ch1 ...
790 // bin0 bin1 bin2... bin0...
791 // rpt0,1,2...256 rpt0,1,2...
792 KERNEL(
793 \n#define HIST_REDUNDANCY 256\n
794 \n#define GROUP_SIZE 256\n
795 \n#define HIST_SIZE 256\n
796 \n#define NUM_CHANNELS 4\n
797 \n#define HR_UNROLL_SIZE 8 \n
798 \n#define HR_UNROLL_TYPE uchar8 \n
799 
800 __attribute__((reqd_work_group_size(256, 1, 1)))
801 __kernel
802 void kernel_HistogramRectAllChannels(
803  __global const uchar8 *data,
804  uint numPixels,
805  __global uint *histBuffer) {
806 
807  // declare variables
808  uchar8 pixels;
809  int threadOffset = get_global_id(0)%HIST_REDUNDANCY;
810 
811  // for each pixel/channel, accumulate in global memory
812  for (uint pc = get_global_id(0); pc < numPixels*NUM_CHANNELS/HR_UNROLL_SIZE; pc += get_global_size(0)) {
813  pixels = data[pc];
814  // channel bin thread
815  atomic_inc(&histBuffer[0*HIST_SIZE*HIST_REDUNDANCY + pixels.s0*HIST_REDUNDANCY + threadOffset]); // ch0
816  atomic_inc(&histBuffer[0*HIST_SIZE*HIST_REDUNDANCY + pixels.s4*HIST_REDUNDANCY + threadOffset]); // ch0
817  atomic_inc(&histBuffer[1*HIST_SIZE*HIST_REDUNDANCY + pixels.s1*HIST_REDUNDANCY + threadOffset]); // ch1
818  atomic_inc(&histBuffer[1*HIST_SIZE*HIST_REDUNDANCY + pixels.s5*HIST_REDUNDANCY + threadOffset]); // ch1
819  atomic_inc(&histBuffer[2*HIST_SIZE*HIST_REDUNDANCY + pixels.s2*HIST_REDUNDANCY + threadOffset]); // ch2
820  atomic_inc(&histBuffer[2*HIST_SIZE*HIST_REDUNDANCY + pixels.s6*HIST_REDUNDANCY + threadOffset]); // ch2
821  atomic_inc(&histBuffer[3*HIST_SIZE*HIST_REDUNDANCY + pixels.s3*HIST_REDUNDANCY + threadOffset]); // ch3
822  atomic_inc(&histBuffer[3*HIST_SIZE*HIST_REDUNDANCY + pixels.s7*HIST_REDUNDANCY + threadOffset]); // ch3
823  }
824 }
825 )
826 
827 KERNEL(
828 // NUM_CHANNELS = 1
829 __attribute__((reqd_work_group_size(256, 1, 1)))
830 __kernel
831 void kernel_HistogramRectOneChannel(
832  __global const uchar8 *data,
833  uint numPixels,
834  __global uint *histBuffer) {
835 
836  // declare variables
837  uchar8 pixels;
838  int threadOffset = get_global_id(0)%HIST_REDUNDANCY;
839 
840  // for each pixel/channel, accumulate in global memory
841  for (uint pc = get_global_id(0); pc < numPixels/HR_UNROLL_SIZE; pc += get_global_size(0)) {
842  pixels = data[pc];
843  // bin thread
844  atomic_inc(&histBuffer[pixels.s0*HIST_REDUNDANCY + threadOffset]);
845  atomic_inc(&histBuffer[pixels.s1*HIST_REDUNDANCY + threadOffset]);
846  atomic_inc(&histBuffer[pixels.s2*HIST_REDUNDANCY + threadOffset]);
847  atomic_inc(&histBuffer[pixels.s3*HIST_REDUNDANCY + threadOffset]);
848  atomic_inc(&histBuffer[pixels.s4*HIST_REDUNDANCY + threadOffset]);
849  atomic_inc(&histBuffer[pixels.s5*HIST_REDUNDANCY + threadOffset]);
850  atomic_inc(&histBuffer[pixels.s6*HIST_REDUNDANCY + threadOffset]);
851  atomic_inc(&histBuffer[pixels.s7*HIST_REDUNDANCY + threadOffset]);
852  }
853 }
854 )
855 
856 // HistogramRect Kernel: Reduction
857 // only supports 4 channels
858 // each work group handles a single channel of a single histogram bin
859 KERNEL(
860 __attribute__((reqd_work_group_size(256, 1, 1)))
861 __kernel
862 void kernel_HistogramRectAllChannelsReduction(
863  int n, // unused pixel redundancy
864  __global uint *histBuffer,
865  __global int* histResult) {
866 
867  // declare variables
868  int channel = get_group_id(0)/HIST_SIZE;
869  int bin = get_group_id(0)%HIST_SIZE;
870  int value = 0;
871 
872  // accumulate in register
873  for (uint i = get_local_id(0); i < HIST_REDUNDANCY; i+=GROUP_SIZE) {
874  value += histBuffer[ channel*HIST_SIZE*HIST_REDUNDANCY+bin*HIST_REDUNDANCY+i];
875  }
876 
877  // reduction in local memory
878  __local int localHist[GROUP_SIZE];
879  localHist[get_local_id(0)] = value;
880  barrier(CLK_LOCAL_MEM_FENCE);
881  for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
882  if (get_local_id(0) < stride) {
883  value = localHist[ get_local_id(0)+stride];
884  }
885  barrier(CLK_LOCAL_MEM_FENCE);
886  if (get_local_id(0) < stride) {
887  localHist[ get_local_id(0)] += value;
888  }
889  barrier(CLK_LOCAL_MEM_FENCE);
890  }
891 
892  // write reduction to final result
893  if (get_local_id(0) == 0) {
894  histResult[get_group_id(0)] = localHist[0];
895  }
896 } // kernel_HistogramRectAllChannels
897 )
898 
899 
900 KERNEL(
901 // NUM_CHANNELS = 1
902 __attribute__((reqd_work_group_size(256, 1, 1)))
903 __kernel
904 void kernel_HistogramRectOneChannelReduction(
905  int n, // unused pixel redundancy
906  __global uint *histBuffer,
907  __global int* histResult) {
908 
909  // declare variables
910  // int channel = get_group_id(0)/HIST_SIZE;
911  int bin = get_group_id(0)%HIST_SIZE;
912  int value = 0;
913 
914  // accumulate in register
915  for (int i = get_local_id(0); i < HIST_REDUNDANCY; i+=GROUP_SIZE) {
916  value += histBuffer[ bin*HIST_REDUNDANCY+i];
917  }
918 
919  // reduction in local memory
920  __local int localHist[GROUP_SIZE];
921  localHist[get_local_id(0)] = value;
922  barrier(CLK_LOCAL_MEM_FENCE);
923  for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
924  if (get_local_id(0) < stride) {
925  value = localHist[ get_local_id(0)+stride];
926  }
927  barrier(CLK_LOCAL_MEM_FENCE);
928  if (get_local_id(0) < stride) {
929  localHist[ get_local_id(0)] += value;
930  }
931  barrier(CLK_LOCAL_MEM_FENCE);
932  }
933 
934  // write reduction to final result
935  if (get_local_id(0) == 0) {
936  histResult[get_group_id(0)] = localHist[0];
937  }
938 } // kernel_HistogramRectOneChannelReduction
939 )
940 
941 // ThresholdRectToPix Kernel
942 // only supports 4 channels
943 // imageData is input image (24-bits/pixel)
944 // pix is output image (1-bit/pixel)
945 KERNEL(
946 \n#define CHAR_VEC_WIDTH 4 \n
947 \n#define PIXELS_PER_WORD 32 \n
948 \n#define PIXELS_PER_BURST 8 \n
949 \n#define BURSTS_PER_WORD (PIXELS_PER_WORD/PIXELS_PER_BURST) \n
950  typedef union {
951  uchar s[PIXELS_PER_BURST*NUM_CHANNELS];
952  uchar4 v[(PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH];
953  } charVec;
954 
955 __attribute__((reqd_work_group_size(256, 1, 1)))
956 __kernel
957 void kernel_ThresholdRectToPix(
958  __global const uchar4 *imageData,
959  int height,
960  int width,
961  int wpl, // words per line
962  __global int *thresholds,
963  __global int *hi_values,
964  __global int *pix) {
965 
966  // declare variables
967  int pThresholds[NUM_CHANNELS];
968  int pHi_Values[NUM_CHANNELS];
969  for (int i = 0; i < NUM_CHANNELS; i++) {
970  pThresholds[i] = thresholds[i];
971  pHi_Values[i] = hi_values[i];
972  }
973 
974  // for each word (32 pixels) in output image
975  for (uint w = get_global_id(0); w < wpl*height; w += get_global_size(0)) {
976  unsigned int word = 0; // all bits start at zero
977  // for each burst in word
978  for (int b = 0; b < BURSTS_PER_WORD; b++) {
979  // load burst
980  charVec pixels;
981  int offset = (w / wpl) * width;
982  offset += (w % wpl) * PIXELS_PER_WORD;
983  offset += b * PIXELS_PER_BURST;
984 
985  for (int i = 0; i < PIXELS_PER_BURST; ++i)
986  pixels.v[i] = imageData[offset + i];
987 
988  // for each pixel in burst
989  for (int p = 0; p < PIXELS_PER_BURST; p++) {
990  for (int c = 0; c < NUM_CHANNELS; c++) {
991  unsigned char pixChan = pixels.s[p*NUM_CHANNELS + c];
992  if (pHi_Values[c] >= 0 && (pixChan > pThresholds[c]) == (pHi_Values[c] == 0)) {
993  const uint kTopBit = 0x80000000;
994  word |= (kTopBit >> ((b*PIXELS_PER_BURST+p)&31));
995  }
996  }
997  }
998  }
999  pix[w] = word;
1000  }
1001 }
1002 
1003 \n#define CHAR_VEC_WIDTH 8 \n
1004 \n#define PIXELS_PER_WORD 32 \n
1005 \n#define PIXELS_PER_BURST 8 \n
1006 \n#define BURSTS_PER_WORD (PIXELS_PER_WORD/PIXELS_PER_BURST) \n
1007  typedef union {
1008  uchar s[PIXELS_PER_BURST*1];
1009  uchar8 v[(PIXELS_PER_BURST*1)/CHAR_VEC_WIDTH];
1010  } charVec1;
1011 
1012 __attribute__((reqd_work_group_size(256, 1, 1)))
1013 __kernel
1014 void kernel_ThresholdRectToPix_OneChan(
1015  __global const uchar8 *imageData,
1016  int height,
1017  int width,
1018  int wpl, // words per line of output image
1019  __global int *thresholds,
1020  __global int *hi_values,
1021  __global int *pix) {
1022 
1023  // declare variables
1024  int pThresholds[1];
1025  int pHi_Values[1];
1026  for (int i = 0; i < 1; i++) {
1027  pThresholds[i] = thresholds[i];
1028  pHi_Values[i] = hi_values[i];
1029  }
1030 
1031  // for each word (32 pixels) in output image
1032  for (uint w = get_global_id(0); w < wpl*height; w += get_global_size(0)) {
1033  unsigned int word = 0; // all bits start at zero
1034 
1035  // for each burst in word
1036  for (int b = 0; b < BURSTS_PER_WORD; b++) {
1037 
1038  // load burst
1039  charVec1 pixels;
1040  // for each char8 in burst
1041  pixels.v[0] = imageData[
1042  w*BURSTS_PER_WORD
1043  + b
1044  + 0 ];
1045 
1046  // for each pixel in burst
1047  for (int p = 0; p < PIXELS_PER_BURST; p++) {
1048 
1049  //int littleEndianIdx = p ^ 3;
1050  //int bigEndianIdx = p;
1051  int idx =
1052 \n#ifdef __ENDIAN_LITTLE__\n
1053  p ^ 3;
1054 \n#else\n
1055  p;
1056 \n#endif\n
1057  unsigned char pixChan = pixels.s[idx];
1058  if (pHi_Values[0] >= 0 && (pixChan > pThresholds[0]) == (pHi_Values[0] == 0)) {
1059  const uint kTopBit = 0x80000000;
1060  word |= (kTopBit >> ((b*PIXELS_PER_BURST+p)&31));
1061  }
1062  }
1063  }
1064  pix[w] = word;
1065  }
1066 }
1067 )
1068 
1069  ; // close char*
1070 
1071 #endif // USE_EXTERNAL_KERNEL
1072 #endif // TESSERACT_OPENCL_OCLKERNELS_H_
1073 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */