tesseract  v4.0.0-17-g361f3264
Open Source OCR Engine
normstrngs.h
1 /**********************************************************************
2  * File: normstrngs.h
3  * Description: Utilities to normalize and manipulate UTF-32 and
4  * UTF-8 strings.
5  * Author: Ranjith Unnikrishnan
6  * Created: Thu July 4 2013
7  *
8  * (C) Copyright 2013, Google Inc.
9  * Licensed under the Apache License, Version 2.0 (the "License");
10  * you may not use this file except in compliance with the License.
11  * You may obtain a copy of the License at
12  * http://www.apache.org/licenses/LICENSE-2.0
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  *
19  **********************************************************************/
20 
21 #ifndef TESSERACT_CCUTIL_NORMSTRNGS_H_
22 #define TESSERACT_CCUTIL_NORMSTRNGS_H_
23 
24 #include <string>
25 #include <vector>
26 
27 #include "validator.h"
28 
29 namespace tesseract {
30 
31 // The standard unicode normalizations.
32 enum class UnicodeNormMode {
33  kNFD,  // Normalization Form D: canonical decomposition.
34  kNFC,  // Normalization Form C: canonical decomposition, then canonical composition.
35  kNFKD,  // Normalization Form KD: compatibility decomposition.
36  kNFKC,  // Normalization Form KC: compatibility decomposition, then canonical composition.
37 };
38 
39 // To normalize away differences in punctuation that are ambiguous, like
40 // curly quotes and different widths of dash.
41 enum class OCRNorm {
42  kNone,  // Presumably leaves ambiguous punctuation untouched -- confirm in normstrngs.cpp.
43  kNormalize,  // Presumably folds ambiguous punctuation (curly quotes, dash widths) -- confirm in normstrngs.cpp.
44 };
45 
46 // To validate and normalize away some subtle differences that can occur in
47 // Indic scripts, eg ensuring that an explicit virama is always followed by
48 // a zero-width non-joiner.
49 enum class GraphemeNorm {
50  kNone,  // Presumably skips the Indic grapheme validation/normalization -- confirm in normstrngs.cpp.
51  kNormalize,  // Presumably applies it (e.g. the virama + zero-width non-joiner rule) -- confirm in normstrngs.cpp.
52 };
53 
54 // Normalizes a UTF8 string according to the given modes. Returns true on
55 // success. If false is returned, some failure or invalidity was present, and
56 // the result string is produced on a "best effort" basis.
57 bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
58  GraphemeNorm grapheme_normalize, const char* str8,
59  std::string* normalized);
60 // Normalizes a UTF8 string according to the given modes and splits into
61 // graphemes according to g_mode. Returns true on success. If false is returned,
62 // some failure or invalidity was present, and the graphemes are produced on
63 // a "best effort" basis.
64 bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
65  GraphemeNormMode g_mode, bool report_errors,
66  const char* str8,
67  std::vector<std::string>* graphemes);
68 
69 // Applies just the OCR-specific normalizations and returns the normalized char.
70 char32 OCRNormalize(char32 ch);
71 
72 // Returns true if the OCRNormalized ch1 and ch2 are the same.
73 bool IsOCREquivalent(char32 ch1, char32 ch2);
74 
75 // Returns true if the value lies in the range of valid unicodes.
76 bool IsValidCodepoint(const char32 ch);
77 
78 // Returns true if a code point has the White_Space Unicode property.
79 bool IsWhitespace(const char32 ch);
80 // Returns true if every char in the given (null-terminated) string has the
81 // White_Space Unicode property.
82 bool IsUTF8Whitespace(const char* text);
83 
84 // Returns the length in bytes of the prefix of 'text' whose characters all
85 // have the White_Space Unicode property.
86 unsigned int SpanUTF8Whitespace(const char* text);
87 
88 // Returns the length in bytes of the prefix of 'text' whose characters all
89 // DO NOT have the White_Space Unicode property.
90 unsigned int SpanUTF8NotWhitespace(const char* text);
91 
92 // Returns true if the char is interchange valid i.e. no C0 or C1 control codes
93 // (other than CR LF HT FF) and no non-characters.
94 bool IsInterchangeValid(const char32 ch);
95 // Same as above but restricted to 7-bit ASCII.
96 bool IsInterchangeValid7BitAscii(const char32 ch);
97 
98 // Converts a full-width char32 code point to its half-width form.
99 char32 FullwidthToHalfwidth(const char32 ch);
100 
101 } // namespace tesseract
102 
103 #endif // TESSERACT_CCUTIL_NORMSTRNGS_H_
OCRNorm
Definition: normstrngs.h:41
signed int char32
Definition: unichar.h:52
bool IsInterchangeValid(const char32 ch)
Definition: normstrngs.cpp:253
bool IsUTF8Whitespace(const char *text)
Definition: normstrngs.cpp:229
GraphemeNormMode
Definition: validator.h:34
Definition: baseapi.cpp:94
GraphemeNorm
Definition: normstrngs.h:49
bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNorm grapheme_normalize, const char *str8, std::string *normalized)
Definition: normstrngs.cpp:147
char32 FullwidthToHalfwidth(const char32 ch)
Definition: normstrngs.cpp:282
bool IsInterchangeValid7BitAscii(const char32 ch)
Definition: normstrngs.cpp:276
unsigned int SpanUTF8NotWhitespace(const char *text)
Definition: normstrngs.cpp:243
UnicodeNormMode
Definition: normstrngs.h:32
unsigned int SpanUTF8Whitespace(const char *text)
Definition: normstrngs.cpp:233
bool IsWhitespace(const char32 ch)
Definition: normstrngs.cpp:223
bool IsValidCodepoint(const char32 ch)
Definition: normstrngs.cpp:218
bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNormMode g_mode, bool report_errors, const char *str8, std::vector< std::string > *graphemes)
Definition: normstrngs.cpp:172
bool IsOCREquivalent(char32 ch1, char32 ch2)
Definition: normstrngs.cpp:214
char32 OCRNormalize(char32 ch)
Definition: normstrngs.cpp:204