11 #ifndef TESSERACT_OPENCL_OCLKERNELS_H_ 12 #define TESSERACT_OPENCL_OCLKERNELS_H_ 14 #ifndef USE_EXTERNAL_KERNEL 15 #define KERNEL(...) #__VA_ARGS__ "\n" 21 const char *kernel_src = KERNEL(
22 \n#ifdef KHR_DP_EXTENSION\n
23 \n#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n
24 \n#elif AMD_DP_EXTENSION\n
25 \n#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n
28 __kernel
void composeRGBPixel(__global uint *tiffdata,
int w,
int h,
int wpl, __global uint *output)
30 int i = get_global_id(1);
31 int j = get_global_id(0);
32 int tiffword,rval,gval,bval;
35 if ((i >= h) || (j >= w))
38 tiffword = tiffdata[i * w + j];
39 rval = ((tiffword) & 0xff);
40 gval = (((tiffword) >> 8) & 0xff);
41 bval = (((tiffword) >> 16) & 0xff);
42 output[i*wpl+j] = (rval << (8 * (
sizeof(uint) - 1 - 0))) | (gval << (8 * (
sizeof(uint) - 1 - 1))) | (bval << (8 * (
sizeof(uint) - 1 - 2)));
47 \n__kernel
void pixSubtract_inplace(__global
int *dword, __global
int *sword,
48 const int wpl,
const int h)
50 const unsigned int row = get_global_id(1);
51 const unsigned int col = get_global_id(0);
52 const unsigned int pos = row * wpl + col;
55 if (row >= h || col >= wpl)
58 *(dword + pos) &= ~(*(sword + pos));
63 \n__kernel
void pixSubtract(__global
int *dword, __global
int *sword,
64 const int wpl,
const int h, __global
int *outword)
66 const unsigned int row = get_global_id(1);
67 const unsigned int col = get_global_id(0);
68 const unsigned int pos = row * wpl + col;
71 if (row >= h || col >= wpl)
74 *(outword + pos) = *(dword + pos) & ~(*(sword + pos));
79 \n__kernel
void morphoDilateHor_5x5(__global
int *sword,__global
int *dword,
80 const int wpl,
const int h)
82 const unsigned int pos = get_global_id(0);
83 unsigned int prevword, nextword, currword,tempword;
84 unsigned int destword;
85 const int col = pos % wpl;
92 currword = *(sword + pos);
99 prevword = *(sword + pos - 1);
104 nextword = *(sword + pos + 1);
110 tempword = (prevword << (31)) | ((currword >> 1));
111 destword |= tempword;
113 tempword = (currword << 1) | (nextword >> (31));
114 destword |= tempword;
118 tempword = (prevword << (30)) | ((currword >> 2));
119 destword |= tempword;
121 tempword = (currword << 2) | (nextword >> (30));
122 destword |= tempword;
125 *(dword + pos) = destword;
131 \n__kernel
void morphoDilateVer_5x5(__global
int *sword,__global
int *dword,
132 const int wpl,
const int h)
134 const int col = get_global_id(0);
135 const int row = get_global_id(1);
136 const unsigned int pos = row * wpl + col;
137 unsigned int tempword;
138 unsigned int destword;
142 if (row >= h || col >= wpl)
145 destword = *(sword + pos);
148 i = (row - 2) < 0 ? row : (row - 2);
149 tempword = *(sword + i*wpl + col);
150 destword |= tempword;
153 i = (row - 1) < 0 ? row : (row - 1);
154 tempword = *(sword + i*wpl + col);
155 destword |= tempword;
158 i = (row >= (h - 1)) ? row : (row + 1);
159 tempword = *(sword + i*wpl + col);
160 destword |= tempword;
163 i = (row >= (h - 2)) ? row : (row + 2);
164 tempword = *(sword + i*wpl + col);
165 destword |= tempword;
167 *(dword + pos) = destword;
172 \n__kernel
void morphoDilateHor(__global
int *sword,__global
int *dword,
const int xp,
const int xn,
const int wpl,
const int h)
174 const int col = get_global_id(0);
175 const int row = get_global_id(1);
176 const unsigned int pos = row * wpl + col;
177 unsigned int parbitsxp, parbitsxn, nwords;
178 unsigned int destword, tempword, lastword, currword;
179 unsigned int lnextword, lprevword, rnextword, rprevword, firstword, secondword;
180 int i, j, siter, eiter;
183 if (pos >= (wpl*h) || (xn < 1 && xp < 1))
186 currword = *(sword + pos);
198 siter = (col - nwords);
199 eiter = (col + nwords);
205 firstword = *(sword + pos - 1);
208 if (col == (wpl - 1))
211 secondword = *(sword + pos + 1);
214 for (i = 1; i <= parbitsxp; i++)
217 tempword = ((i == parbitsxp) && (parbitsxp != parbitsxn)) ? 0x0 : (firstword << (32-i)) | ((currword >> i));
219 destword |= tempword;
222 tempword = (currword << i) | (secondword >> (32 - i));
223 destword |= tempword;
231 destword |= firstword;
235 destword |= secondword;
238 *(dword + pos) = destword;
245 firstword = *(sword + row*wpl + siter);
250 lastword = *(sword + row*wpl + eiter);
252 for (i = 1; i < nwords; i++)
258 secondword = *(sword + row*wpl + siter + i);
260 lprevword = firstword << (32 - parbitsxn) | secondword >> parbitsxn;
262 firstword = secondword;
264 if ((siter + i + 1) < 0)
267 secondword = *(sword + row*wpl + siter + i + 1);
269 lnextword = firstword << (32 - parbitsxn) | secondword >> parbitsxn;
272 if ((eiter - i) >= wpl)
275 firstword = *(sword + row*wpl + eiter - i);
277 rnextword = firstword << parbitsxp | lastword >> (32 - parbitsxp);
279 lastword = firstword;
280 if ((eiter - i - 1) >= wpl)
283 firstword = *(sword + row*wpl + eiter - i - 1);
285 rprevword = firstword << parbitsxp | lastword >> (32 - parbitsxp);
287 for (j = 1; j < 32; j++)
290 tempword = (lprevword << j) | (lnextword >> (32 - j));
291 destword |= tempword;
294 tempword = (rprevword << j) | (rnextword >> (32 - j));
295 destword |= tempword;
298 destword |= lprevword;
299 destword |= lnextword;
300 destword |= rprevword;
301 destword |= rnextword;
303 lastword = firstword;
304 firstword = secondword;
307 *(dword + pos) = destword;
312 \n__kernel
void morphoDilateHor_32word(__global
int *sword,__global
int *dword,
314 const int wpl,
const int h,
317 const int col = get_global_id(0);
318 const int row = get_global_id(1);
319 const unsigned int pos = row * wpl + col;
320 unsigned int prevword, nextword, currword,tempword;
321 unsigned int destword;
325 if (pos >= (wpl * h))
328 currword = *(sword + pos);
335 prevword = *(sword + pos - 1);
340 nextword = *(sword + pos + 1);
342 for (i = 1; i <= halfwidth; i++)
345 if (i == halfwidth && isEven)
351 tempword = (prevword << (32-i)) | ((currword >> i));
354 destword |= tempword;
357 tempword = (currword << i) | (nextword >> (32 - i));
359 destword |= tempword;
362 *(dword + pos) = destword;
367 \n__kernel
void morphoDilateVer(__global
int *sword,__global
int *dword,
369 const int wpl,
const int h,
372 const int col = get_global_id(0);
373 const int row = get_global_id(1);
374 const unsigned int pos = row * wpl + col;
375 unsigned int tempword;
376 unsigned int destword;
380 if (row >= h || col >= wpl)
383 destword = *(sword + pos);
386 siter = (row - yn) < 0 ? 0 : (row - yn);
387 eiter = (row >= (h - yp)) ? (h - 1) : (row + yp);
389 for (i = siter; i <= eiter; i++)
391 tempword = *(sword + i*wpl + col);
393 destword |= tempword;
396 *(dword + pos) = destword;
401 \n__kernel
void morphoErodeHor_5x5(__global
int *sword,__global
int *dword,
402 const int wpl,
const int h)
404 const unsigned int pos = get_global_id(0);
405 unsigned int prevword, nextword, currword,tempword;
406 unsigned int destword;
407 const int col = pos % wpl;
410 if (pos >= (wpl * h))
413 currword = *(sword + pos);
420 prevword = *(sword + pos - 1);
425 nextword = *(sword + pos + 1);
431 tempword = (prevword << (31)) | ((currword >> 1));
432 destword &= tempword;
434 tempword = (currword << 1) | (nextword >> (31));
435 destword &= tempword;
439 tempword = (prevword << (30)) | ((currword >> 2));
440 destword &= tempword;
442 tempword = (currword << 2) | (nextword >> (30));
443 destword &= tempword;
446 *(dword + pos) = destword;
452 \n__kernel
void morphoErodeVer_5x5(__global
int *sword,__global
int *dword,
453 const int wpl,
const int h,
454 const int fwmask,
const int lwmask)
456 const int col = get_global_id(0);
457 const int row = get_global_id(1);
458 const unsigned int pos = row * wpl + col;
459 unsigned int tempword;
460 unsigned int destword;
464 if (row >= h || col >= wpl)
467 destword = *(sword + pos);
469 if (row < 2 || row >= (h - 2))
478 tempword = *(sword + i*wpl + col);
479 destword &= tempword;
484 tempword = *(sword + i*wpl + col);
485 destword &= tempword;
490 tempword = *(sword + i*wpl + col);
491 destword &= tempword;
496 tempword = *(sword + i*wpl + col);
497 destword &= tempword;
503 if (col == (wpl - 1))
510 *(dword + pos) = destword;
515 \n__kernel
void morphoErodeHor(__global
int *sword,__global
int *dword,
const int xp,
const int xn,
const int wpl,
516 const int h,
const char isAsymmetric,
const int rwmask,
const int lwmask)
518 const int col = get_global_id(0);
519 const int row = get_global_id(1);
520 const unsigned int pos = row * wpl + col;
521 unsigned int parbitsxp, parbitsxn, nwords;
522 unsigned int destword, tempword, lastword, currword;
523 unsigned int lnextword, lprevword, rnextword, rprevword, firstword, secondword;
524 int i, j, siter, eiter;
527 if (pos >= (wpl*h) || (xn < 1 && xp < 1))
530 currword = *(sword + pos);
542 siter = (col - nwords);
543 eiter = (col + nwords);
547 firstword = 0xffffffff;
549 firstword = *(sword + pos - 1);
552 if (col == (wpl - 1))
553 secondword = 0xffffffff;
555 secondword = *(sword + pos + 1);
558 for (i = 1; i <= parbitsxp; i++)
561 tempword = (firstword << (32-i)) | ((currword >> i));
562 destword &= tempword;
565 tempword = ((i == parbitsxp) && (parbitsxp != parbitsxn)) ? 0xffffffff : (currword << i) | (secondword >> (32 - i));
568 destword &= tempword;
576 destword &= firstword;
580 destword &= secondword;
588 if (col == (wpl - 1))
592 *(dword + pos) = destword;
597 firstword = 0xffffffff;
599 firstword = *(sword + row*wpl + siter);
602 lastword = 0xffffffff;
604 lastword = *(sword + row*wpl + eiter);
607 for (i = 1; i < nwords; i++)
611 secondword = 0xffffffff;
613 secondword = *(sword + row*wpl + siter + i);
615 lprevword = firstword << (32 - parbitsxp) | secondword >> (parbitsxp);
617 firstword = secondword;
619 if ((siter + i + 1) < 0)
620 secondword = 0xffffffff;
622 secondword = *(sword + row*wpl + siter + i + 1);
624 lnextword = firstword << (32 - parbitsxp) | secondword >> (parbitsxp);
627 if ((eiter - i) >= wpl)
628 firstword = 0xffffffff;
630 firstword = *(sword + row*wpl + eiter - i);
632 rnextword = firstword << parbitsxn | lastword >> (32 - parbitsxn);
634 lastword = firstword;
635 if ((eiter - i - 1) >= wpl)
636 firstword = 0xffffffff;
638 firstword = *(sword + row*wpl + eiter - i - 1);
640 rprevword = firstword << parbitsxn | lastword >> (32 - parbitsxn);
642 for (j = 0; j < 32; j++)
645 tempword = (lprevword << j) | (lnextword >> (32 - j));
646 destword &= tempword;
649 tempword = (rprevword << j) | (rnextword >> (32 - j));
650 destword &= tempword;
653 destword &= lprevword;
654 destword &= lnextword;
655 destword &= rprevword;
656 destword &= rnextword;
658 lastword = firstword;
659 firstword = secondword;
665 if (col < (nwords - 1))
667 else if (col == (nwords - 1))
669 else if (col > (wpl - nwords))
671 else if (col == (wpl - nwords))
675 *(dword + pos) = destword;
680 \n__kernel
void morphoErodeHor_32word(__global
int *sword,__global
int *dword,
681 const int halfwidth,
const int wpl,
682 const int h,
const char clearBoundPixH,
683 const int rwmask,
const int lwmask,
686 const int col = get_global_id(0);
687 const int row = get_global_id(1);
688 const unsigned int pos = row * wpl + col;
689 unsigned int prevword, nextword, currword,tempword, destword;
693 if (pos >= (wpl * h))
696 currword = *(sword + pos);
703 prevword = *(sword + pos - 1);
708 nextword = *(sword + pos + 1);
710 for (i = 1; i <= halfwidth; i++)
713 tempword = (prevword << (32-i)) | ((currword >> i));
715 destword &= tempword;
718 if (i == halfwidth && isEven)
720 tempword = 0xffffffff;
724 tempword = (currword << i) | (nextword >> (32 - i));
727 destword &= tempword;
736 else if (col == (wpl - 1))
742 *(dword + pos) = destword;
747 \n__kernel
void morphoErodeVer(__global
int *sword,__global
int *dword,
749 const int wpl,
const int h,
750 const char clearBoundPixV,
const int yn)
752 const int col = get_global_id(0);
753 const int row = get_global_id(1);
754 const unsigned int pos = row * wpl + col;
755 unsigned int tempword, destword;
759 if (row >= h || col >= wpl)
762 destword = *(sword + pos);
765 siter = (row - yp) < 0 ? 0 : (row - yp);
766 eiter = (row >= (h - yn)) ? (h - 1) : (row + yn);
768 for (i = siter; i <= eiter; i++)
770 tempword = *(sword + i*wpl + col);
772 destword &= tempword;
776 if (clearBoundPixV && ((row < yp) || ((h - row) <= yn)))
781 *(dword + pos) = destword;
793 \n#define HIST_REDUNDANCY 256\n
794 \n#define GROUP_SIZE 256\n
795 \n#define HIST_SIZE 256\n
796 \n#define NUM_CHANNELS 4\n
797 \n#define HR_UNROLL_SIZE 8 \n
798 \n#define HR_UNROLL_TYPE uchar8 \n
800 __attribute__((reqd_work_group_size(256, 1, 1)))
802 void kernel_HistogramRectAllChannels(
803 __global
const uchar8 *data,
805 __global uint *histBuffer) {
809 int threadOffset = get_global_id(0)%HIST_REDUNDANCY;
812 for (uint pc = get_global_id(0); pc < numPixels*NUM_CHANNELS/HR_UNROLL_SIZE; pc += get_global_size(0)) {
815 atomic_inc(&histBuffer[0*HIST_SIZE*HIST_REDUNDANCY + pixels.s0*HIST_REDUNDANCY + threadOffset]);
816 atomic_inc(&histBuffer[0*HIST_SIZE*HIST_REDUNDANCY + pixels.s4*HIST_REDUNDANCY + threadOffset]);
817 atomic_inc(&histBuffer[1*HIST_SIZE*HIST_REDUNDANCY + pixels.s1*HIST_REDUNDANCY + threadOffset]);
818 atomic_inc(&histBuffer[1*HIST_SIZE*HIST_REDUNDANCY + pixels.s5*HIST_REDUNDANCY + threadOffset]);
819 atomic_inc(&histBuffer[2*HIST_SIZE*HIST_REDUNDANCY + pixels.s2*HIST_REDUNDANCY + threadOffset]);
820 atomic_inc(&histBuffer[2*HIST_SIZE*HIST_REDUNDANCY + pixels.s6*HIST_REDUNDANCY + threadOffset]);
821 atomic_inc(&histBuffer[3*HIST_SIZE*HIST_REDUNDANCY + pixels.s3*HIST_REDUNDANCY + threadOffset]);
822 atomic_inc(&histBuffer[3*HIST_SIZE*HIST_REDUNDANCY + pixels.s7*HIST_REDUNDANCY + threadOffset]);
829 __attribute__((reqd_work_group_size(256, 1, 1)))
831 void kernel_HistogramRectOneChannel(
832 __global
const uchar8 *data,
834 __global uint *histBuffer) {
838 int threadOffset = get_global_id(0)%HIST_REDUNDANCY;
841 for (uint pc = get_global_id(0); pc < numPixels/HR_UNROLL_SIZE; pc += get_global_size(0)) {
844 atomic_inc(&histBuffer[pixels.s0*HIST_REDUNDANCY + threadOffset]);
845 atomic_inc(&histBuffer[pixels.s1*HIST_REDUNDANCY + threadOffset]);
846 atomic_inc(&histBuffer[pixels.s2*HIST_REDUNDANCY + threadOffset]);
847 atomic_inc(&histBuffer[pixels.s3*HIST_REDUNDANCY + threadOffset]);
848 atomic_inc(&histBuffer[pixels.s4*HIST_REDUNDANCY + threadOffset]);
849 atomic_inc(&histBuffer[pixels.s5*HIST_REDUNDANCY + threadOffset]);
850 atomic_inc(&histBuffer[pixels.s6*HIST_REDUNDANCY + threadOffset]);
851 atomic_inc(&histBuffer[pixels.s7*HIST_REDUNDANCY + threadOffset]);
860 __attribute__((reqd_work_group_size(256, 1, 1)))
862 void kernel_HistogramRectAllChannelsReduction(
864 __global uint *histBuffer,
865 __global
int* histResult) {
868 int channel = get_group_id(0)/HIST_SIZE;
869 int bin = get_group_id(0)%HIST_SIZE;
873 for (uint i = get_local_id(0); i < HIST_REDUNDANCY; i+=GROUP_SIZE) {
874 value += histBuffer[ channel*HIST_SIZE*HIST_REDUNDANCY+bin*HIST_REDUNDANCY+i];
878 __local
int localHist[GROUP_SIZE];
879 localHist[get_local_id(0)] = value;
880 barrier(CLK_LOCAL_MEM_FENCE);
881 for (
int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
882 if (get_local_id(0) < stride) {
883 value = localHist[ get_local_id(0)+stride];
885 barrier(CLK_LOCAL_MEM_FENCE);
886 if (get_local_id(0) < stride) {
887 localHist[ get_local_id(0)] += value;
889 barrier(CLK_LOCAL_MEM_FENCE);
893 if (get_local_id(0) == 0) {
894 histResult[get_group_id(0)] = localHist[0];
902 __attribute__((reqd_work_group_size(256, 1, 1)))
904 void kernel_HistogramRectOneChannelReduction(
906 __global uint *histBuffer,
907 __global
int* histResult) {
911 int bin = get_group_id(0)%HIST_SIZE;
915 for (
int i = get_local_id(0); i < HIST_REDUNDANCY; i+=GROUP_SIZE) {
916 value += histBuffer[ bin*HIST_REDUNDANCY+i];
920 __local
int localHist[GROUP_SIZE];
921 localHist[get_local_id(0)] = value;
922 barrier(CLK_LOCAL_MEM_FENCE);
923 for (
int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
924 if (get_local_id(0) < stride) {
925 value = localHist[ get_local_id(0)+stride];
927 barrier(CLK_LOCAL_MEM_FENCE);
928 if (get_local_id(0) < stride) {
929 localHist[ get_local_id(0)] += value;
931 barrier(CLK_LOCAL_MEM_FENCE);
935 if (get_local_id(0) == 0) {
936 histResult[get_group_id(0)] = localHist[0];
946 \n#define CHAR_VEC_WIDTH 4 \n
947 \n#define PIXELS_PER_WORD 32 \n
948 \n#define PIXELS_PER_BURST 8 \n
949 \n#define BURSTS_PER_WORD (PIXELS_PER_WORD/PIXELS_PER_BURST) \n
951 uchar s[PIXELS_PER_BURST*NUM_CHANNELS];
952 uchar4 v[(PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH];
955 __attribute__((reqd_work_group_size(256, 1, 1)))
957 void kernel_ThresholdRectToPix(
958 __global const uchar4 *imageData,
962 __global
int *thresholds,
963 __global
int *hi_values,
967 int pThresholds[NUM_CHANNELS];
968 int pHi_Values[NUM_CHANNELS];
969 for (
int i = 0; i < NUM_CHANNELS; i++) {
970 pThresholds[i] = thresholds[i];
971 pHi_Values[i] = hi_values[i];
975 for (uint w = get_global_id(0); w < wpl*height; w += get_global_size(0)) {
976 unsigned int word = 0;
978 for (
int b = 0; b < BURSTS_PER_WORD; b++) {
981 int offset = (w / wpl) * width;
982 offset += (w % wpl) * PIXELS_PER_WORD;
983 offset += b * PIXELS_PER_BURST;
985 for (
int i = 0; i < PIXELS_PER_BURST; ++i)
986 pixels.v[i] = imageData[offset + i];
989 for (
int p = 0; p < PIXELS_PER_BURST; p++) {
990 for (
int c = 0; c < NUM_CHANNELS; c++) {
991 unsigned char pixChan = pixels.s[p*NUM_CHANNELS + c];
992 if (pHi_Values[c] >= 0 && (pixChan > pThresholds[c]) == (pHi_Values[c] == 0)) {
993 const uint kTopBit = 0x80000000;
994 word |= (kTopBit >> ((b*PIXELS_PER_BURST+p)&31));
1003 \n#define CHAR_VEC_WIDTH 8 \n
1004 \n#define PIXELS_PER_WORD 32 \n
1005 \n#define PIXELS_PER_BURST 8 \n
1006 \n#define BURSTS_PER_WORD (PIXELS_PER_WORD/PIXELS_PER_BURST) \n
1008 uchar s[PIXELS_PER_BURST*1];
1009 uchar8 v[(PIXELS_PER_BURST*1)/CHAR_VEC_WIDTH];
1012 __attribute__((reqd_work_group_size(256, 1, 1)))
1014 void kernel_ThresholdRectToPix_OneChan(
1015 __global const uchar8 *imageData,
1019 __global
int *thresholds,
1020 __global
int *hi_values,
1021 __global
int *pix) {
1026 for (
int i = 0; i < 1; i++) {
1027 pThresholds[i] = thresholds[i];
1028 pHi_Values[i] = hi_values[i];
1032 for (uint w = get_global_id(0); w < wpl*height; w += get_global_size(0)) {
1033 unsigned int word = 0;
1036 for (
int b = 0; b < BURSTS_PER_WORD; b++) {
1041 pixels.v[0] = imageData[
1047 for (
int p = 0; p < PIXELS_PER_BURST; p++) {
1052 \n#ifdef __ENDIAN_LITTLE__\n
1057 unsigned char pixChan = pixels.s[idx];
1058 if (pHi_Values[0] >= 0 && (pixChan > pThresholds[0]) == (pHi_Values[0] == 0)) {
1059 const uint kTopBit = 0x80000000;
1060 word |= (kTopBit >> ((b*PIXELS_PER_BURST+p)&31));
1071 #endif // USE_EXTERNAL_KERNEL 1072 #endif // TESSERACT_OPENCL_OCLKERNELS_H_