RecAPI
|
Language, Character Set and Code Page Handling Module. More...
Classes | |
struct | LANGUAGE_INFO |
Language info. More... | |
Defines | |
#define | UNICODE_REJECTED 0x0fffd |
The rejected symbol in Unicode strings. See kRecSetRejectionSymbol and kRecGetRejectionSymbol. | |
#define | UNICODE_MISSING 0x0fffc |
The missing symbol in Unicode strings. See kRecSetMissingSymbol and kRecGetMissingSymbol. | |
#define | MAXCPNAMELEN 32 |
Maximal buffer length needed for Code Page name. | |
Typedefs | |
typedef OUTCODEPAGETYPE * | LPOUTCODEPAGETYPE |
Pointer to an OUTCODEPAGETYPE. | |
Enumerations | |
enum | CHR_FILTER { FILTER_DEFAULT = 0, FILTER_DIGIT = 1, FILTER_UPPERCASE = 2, FILTER_LOWERCASE = 4, FILTER_PUNCTUATION = 8, FILTER_MISCELLANEOUS = 16, FILTER_PLUS = 32, FILTER_USER_DICT = 64, FILTER_ALL, FILTER_ALPHA = (FILTER_UPPERCASE | FILTER_LOWERCASE), FILTER_NUMBERS = (FILTER_DIGIT | FILTER_PLUS), FILTER_SIZE = 128 } |
Recognition filters. More... | |
enum | LANG_ENA { LANG_DISABLED = 0, LANG_ENABLED } |
Language enable/disable. More... | |
enum | MANAGE_LANG { SET_LANG = 0, ADD_LANG, REMOVE_LANG, INVERT_LANG, IS_LANG_ENABLED } |
Language management actions. More... | |
enum | LANGUAGES { LANG_ALL = -1024, LANG_ALL_LATIN = -1023, LANG_ALL_ASIAN = -1022, LANG_START = -3, LANG_UD = -3, LANG_AUTO = -2, LANG_NO = -1, LANG_ENG = 0, LANG_GER, LANG_FRE, LANG_DUT, LANG_NOR, LANG_SWE, LANG_FIN, LANG_DAN, LANG_ICE, LANG_POR, LANG_SPA, LANG_CAT, LANG_GAL, LANG_ITA, LANG_MAL, LANG_GRE, LANG_POL, LANG_CZH, LANG_SLK, LANG_HUN, LANG_SLN, LANG_CRO, LANG_ROM, LANG_ALB, LANG_TUR, LANG_EST, LANG_LAT, LANG_LIT, LANG_ESP, LANG_SRL, LANG_SRB, LANG_MAC, LANG_MOL, LANG_BUL, LANG_BEL, LANG_UKR, LANG_RUS, LANG_CHE, LANG_KAB, LANG_AFR, LANG_AYM, LANG_BAS, LANG_BEM, LANG_BLA, LANG_BRE, LANG_BRA, LANG_BUG, LANG_CHA, LANG_CHU, LANG_COR, LANG_CRW, LANG_ESK, LANG_FAR, LANG_FIJ, LANG_FRI, LANG_FRU, LANG_GLI, LANG_GLS, LANG_GAN, LANG_GUA, LANG_HAN, LANG_HAW, LANG_IDO, LANG_IND, LANG_INT, LANG_KAS, LANG_KAW, LANG_KIK, LANG_KON, LANG_KPE, LANG_KUR, LANG_LTN, LANG_LUB, LANG_LUX, LANG_MLG, LANG_MLY, LANG_MLN, LANG_MAO, LANG_MAY, LANG_MIA, LANG_MIN, LANG_MOH, LANG_NAH, LANG_NYA, LANG_OCC, LANG_OJI, LANG_PAP, LANG_PID, LANG_PRO, LANG_QUE, LANG_RHA, LANG_ROY, LANG_RUA, LANG_RUN, LANG_SAM, LANG_SAR, LANG_SHO, LANG_SIO, LANG_SMI, LANG_SML, LANG_SMN, LANG_SMS, LANG_SOM, LANG_SOT, LANG_SUN, LANG_SWA, LANG_SWZ, LANG_TAG, LANG_TAH, LANG_TIN, LANG_TON, LANG_TUN, LANG_VIS, LANG_WEL, LANG_WEN, LANG_WOL, LANG_XHO, LANG_ZAP, LANG_ZUL, LANG_JPN, LANG_CHS, LANG_CHT, LANG_KRN, LANG_THA, LANG_ARA, LANG_HEB, LANG_VIE, LANG_SIZE } |
Possible languages. More... | |
enum | CONTINENT { C_EUROPE = 0x0001, C_ASIA = 0x0002, C_AFRICA = 0x0004, C_OCEANIA = 0x0008, C_LAMERICA = 0x0010, C_NAMERICA = 0x0020, C_INTERNATIONAL = 0x0040 } |
Continent ID. More... | |
enum | BASIC_LANGUAGE_CHARSET { B_OTH = 0, B_BAS = 1, B_LAT = 2, B_GRE = 4, B_CYR = 8, B_ASN = 16, B_RTL = 32 } |
Character set bases. More... | |
enum | RM_FLAGS |
Recognition Engines supporting a language. More... | |
enum | LANGUAGE_CODE { LANGCODE_ALL = 0, LANGCODE_ENGLISH, LANGCODE_INT_3, LANGCODE_639_1, LANGCODE_639_2B, LANGCODE_639_3, LANGCODE_WIN_3 } |
Language code type. More... | |
enum | OUTCODEPAGETYPE { CODEP_UNKNOWN = 0, SPECIFIC, ASCII_BASED, ANSI_BASED, MAC_BASED, INTERNAL_CP, ASIAN_CODEPAGE } |
Code page types. More... | |
Functions | |
RECERR RECAPIKRN | kRecSetLanguages (int sid, LANG_ENA *pLanguages) |
Setting languages. | |
RECERR RECAPIKRN | kRecGetLanguages (int sid, LANG_ENA *pLanguages) |
Getting languages. | |
RECERR RECAPIKRN | kRecManageLanguages (int sid, MANAGE_LANG action, LANGUAGES language) |
Managing enabled languages. | |
RECERR RECAPIKRN | kRecSetSingleLanguageDetection (int sid, INTBOOL bEnable) |
Automatic Single Language Detection. | |
RECERR RECAPIKRN | kRecGetSingleLanguageDetection (int sid, INTBOOL *pbEnable) |
Getting the single language detection flag. | |
RECERR RECAPIKRN | kRecGetPageLanguages (HPAGE hPage, LANG_ENA *pOcrLanguages) |
Getting languages of the page. | |
RECERR RECAPIKRN | kRecGetLanguageInfo (LANGUAGES lang, LANGUAGE_INFO *pInfo) |
Getting information about a language. | |
RECERR RECAPIKRN | kRecFindLanguages (const LANGUAGE_INFO *pInfo, LANG_ENA *pLanguages) |
Searching for languages. | |
RECERR RECAPIKRN | kRecFindLanguage (LPCTSTR pLangName, LANGUAGES *pLanguage) |
Searching for languages. | |
RECERR RECAPIKRN | kRecFindLanguageEx (LANGUAGE_CODE coding, LPCTSTR pLangName, LANGUAGES *pLanguage, LANG_ENA *pLanguages) |
Searching for languages. | |
RECERR RECAPIKRN | kRecSetLanguagesPlus (int sid, LPCWSTR pOcrLplus) |
Setting LanguagesPlus characters. | |
RECERR RECAPIKRN | kRecGetLanguagesPlus (int sid, LPWSTR pOcrLplus, size_t iBSize) |
Getting LanguagesPlus characters. | |
RECERR RECAPIKRN | kRecSetDefaultFilter (int sid, CHR_FILTER Glfilter) |
Changing global character set filter. | |
RECERR RECAPIKRN | kRecGetDefaultFilter (int sid, CHR_FILTER *pGlfilter) |
Getting the global character set filter. | |
RECERR RECAPIKRN | kRecSetFilterPlus (int sid, LPCWSTR pFilterPlus) |
Setting FilterPlus characters. | |
RECERR RECAPIKRN | kRecGetFilterPlus (int sid, LPWSTR pFilterPlus, size_t iSize) |
Getting FilterPlus characters. | |
RECERR RECAPIKRN | kRecSetRejectionSymbol (int sid, WCHAR wRej) |
Setting the rejection symbol character. | |
RECERR RECAPIKRN | kRecGetRejectionSymbol (int sid, LPWCH pwRej) |
Getting the rejection symbol character. | |
RECERR RECAPIKRN | kRecSetMissingSymbol (int sid, WCHAR wMiss) |
Setting the missing symbol character. | |
RECERR RECAPIKRN | kRecGetMissingSymbol (int sid, LPWCH pwMiss) |
Getting the missing symbol character. | |
RECERR RECAPIKRN | kRecSetCodePage (int sid, LPCTSTR pCodePageName) |
Setting the code page. | |
RECERR RECAPIKRN | kRecGetCodePage (int sid, LPTSTR pCodePageName, size_t buflen) |
Getting the code page name. | |
RECERR RECAPIKRN | kRecGetCodePageInfo (LPCTSTR pCodePageName, LPTSTR pDesc, size_t size, LPOUTCODEPAGETYPE pCodePageType) |
Getting information about the code page. | |
RECERR RECAPIKRN | kRecCheckCodePage (int sid, LPWSTR pMissingChrs, size_t buflen) |
Checking the code page. | |
RECERR RECAPIKRN | kRecGetFirstCodePage (LPTSTR pCodePageName, size_t buflen) |
Starting enumeration of code pages. | |
RECERR RECAPIKRN | kRecGetNextCodePage (LPTSTR pCodePageName, size_t buflen) |
Performing enumeration of code pages. | |
RECERR RECAPIKRN | kRecConvertCodePage2Unicode (int sid, const LPBYTE pChar, size_t *pBuffLen, LPWCH pUniCode) |
Converting from the current code page to UNICODE. | |
RECERR RECAPIKRN | kRecConvertUnicode2CodePage (int sid, WCHAR UniCode, LPBYTE pChar, size_t *pBuffLen) |
Converting from UNICODE to the current code page. |
Language, Character Set and Code Page Handling Module.
This module handles language, character set and code page related settings and their combinations.
The processing language must be specified before calling any processing function on a page. You may define one or more languages with the kRecSetLanguages or kRecManageLanguages functions. The languages specify both the set of characters to recognize and - if spell checking is enabled - the dictionaries to use. If more than one language is enabled, automatic language detection is done. Automatic detection has two working modes:
CCJK and Arabic languages can be recognized one language at a time only (but English characters are automatically enabled), so only the second, Single Language Detection mode is supported when more than one CCJK language and/or the Arabic language is enabled.
NOTE: Single Language Detection of the Thai and Hebrew languages is not supported. Very clean documents in Greek, Russian and other Cyrillic languages can be processed with Single Language Detection, but making your application depend on automatic detection of these languages is not encouraged.
The current code page is specified by the setting Kernel.Chr.CodePage. Its default value is -1 meaning "Auto". Auto code page means the current code page comes from the setting Kernel.Chr.CodePage.Default. The default value of this latter setting on Windows is the code page of the current OS, on Linux and Mac it is UTF-8. See also kRecGetCodePage.
The structure LANGUAGE_INFO provides information about a selected language and particular abbreviations of its name. This module supports the CSDK internal language codes and the following language code standards: ISO/DIS 639-1, ISO/DIS 639-2/B, ISO/DIS 639-3 and Windows 3-letter language codes. See the list of language identifiers for details.
In the ISO 639-3 standard there are languages missing. CSDK defines additional local identifiers for them as follows:
qsl | Serbian (Latin) |
qbp | Brazilian |
qes | Eskimo |
qti | Pirez |
qis | Visayan |
qcs | Chinese (Simplified) |
qct | Chinese (Traditional) |
CSDK extends the ISO 639-3 standard with the following codes coming from ISO 639-2/B:
MYN | Mayan languages |
NAH | Nahuatl languages |
SMI | Sami languages |
WEN | Sorbian languages |
CSDK supports all the 2-letter codes of ISO 639-1 (if its language is supported, see LANGUAGES). There are some non-2-letter codes in this standard that are supported by kRecFindLanguages and similar functions as shown in the following table. The Norwegian language has more codes in addition to the general 'no'; these are also supported:
Lt-sr | Serbian (Latin) |
Cy-sr | Serbian (Cyrillic) |
pt-BR | Brazilian |
nb, nn | Norwegian |
Character set bases.
Basic character set types of languages. See LANGUAGE_INFO.
enum CHR_FILTER |
Recognition filters.
This enum lists available Character Set filter elements. Language environment can be narrowed down by specifying Character Set filters. The name of each filter element denotes the category of characters it validates. A filter is built from one or more filter elements by combining (binary OR-ing) them. There are five disjunct elements, a special one and some pre-defined, combined ones. The filters can have an effect either at zone level (by specifying the zone's filter field) or globally, at page level (defined by the kRecSetDefaultFilter function). Use the FILTER_ALL value to set no filtering.
FILTER_ALL
, FILTER_DIGIT
and FILTER_ALPHA
), RM_HNR (FILTER_ALL
, FILTER_DIGIT
, FILTER_PUNCTUATION
and FILTER_MISCELLANEOUS
). To add the FILTER_PLUS characters to the Character Set defined by the language environment, the filter value should be: FILTER_ALL | FILTER_PLUS. To add the FILTER_PLUS characters to the filtered Character Set, place FILTER_PLUS along with the other required filters. For example, to enable only digits and FILTER_PLUS characters, use: FILTER_DIGIT | FILTER_PLUS. To enable the FILTER_PLUS characters only, FILTER_PLUS must be the only filter element in the zone structure field. This even prevents language selection from validating letters in the current zone.
FILTER_DEFAULT |
Use this value to have the zone handled globally. Do not combine this with any other filters. |
FILTER_DIGIT |
[Disjunct filter] Recognition of numerals only. E.g.: "3" (Digit Three). |
FILTER_UPPERCASE |
[Disjunct filter] Recognition of uppercase letters only, including accented ones. E.g.: "A" (Capital A). |
FILTER_LOWERCASE |
[Disjunct filter] Recognition of lowercase letters only, including accented ones. E.g.: "a" (Lowercase a). |
FILTER_PUNCTUATION |
[Disjunct filter] Recognition of punctuation signs only. E.g.: "!" (Exclamation Mark). |
FILTER_MISCELLANEOUS |
[Disjunct filter] Recognition of other miscellaneous characters only. E.g.: "+" (Plus sign). |
FILTER_PLUS |
[Special, combinable filter] Enables the use of the FilterPlus characters specified by the kRecSetFilterPlus function. The FilterPlus characters are added after any kind of filtering. |
FILTER_USER_DICT |
[Special, combinable filter] Recognition of characters from the user dictionary. |
FILTER_ALL |
[Pre-defined combined filter] Since all elements are enabled, there is no filtering. |
FILTER_ALPHA |
[Pre-defined combined filter] Recognition of upper and lowercase letters only. |
FILTER_NUMBERS |
[Pre-defined combined filter] Recognition of digits and the FilterPlus characters set by the kRecSetFilterPlus function. |
FILTER_SIZE |
Number of possible combinations of the disjunct filters. |
enum CONTINENT |
Continent ID.
This enum can be used for identifying the geographical location, where a given language is spoken. See LANGUAGE_INFO.
enum LANG_ENA |
Language enable/disable.
This defines the possible values for the language selection in the Language environment definition. This is supplied by the enum LANGUAGES and used by the function kRecSetLanguages.
enum LANGUAGE_CODE |
Language code type.
One of these values can be used with the kRecFindLanguageEx function to specify the type of the language name abbreviation code to search.
enum LANGUAGES |
Possible languages.
This enum identifies the different languages supported directly by the Engine. In the Engine these languages are used in two different places:
LANG_ALL |
Use with kRecManageLanguages only! See details there. |
LANG_ALL_LATIN |
Use with kRecManageLanguages only! See details there. |
LANG_ALL_ASIAN |
Use with kRecManageLanguages only! See details there. |
LANG_START |
First 'Special' language ID |
LANG_UD |
User dictionary |
LANG_AUTO |
Automatic spell checking language selection. Use with kRecSetSpellLanguage only! See details there. (Default for spell checking) |
LANG_NO |
No spell checking language selection. Use with kRecSetSpellLanguage only! See details there. |
LANG_ENG |
English language selection. Spelling supported! (Default for recognition). ISO/DIS 639-3 code is 'eng'. |
LANG_GER |
German language selection. Spelling supported! ISO/DIS 639-3 code is 'deu'. |
LANG_FRE |
French language selection. Spelling supported! ISO/DIS 639-3 code is 'fra'. |
LANG_DUT |
Dutch language selection. Spelling supported! ISO/DIS 639-3 code is 'nld'. |
LANG_NOR |
Norwegian language selection. Spelling supported! ISO/DIS 639-3 code is 'nor'. |
LANG_SWE |
Swedish language selection. Spelling supported! ISO/DIS 639-3 code is 'swe'. |
LANG_FIN |
Finnish language selection. Spelling supported! ISO/DIS 639-3 code is 'fin'. |
LANG_DAN |
Danish language selection. Spelling supported! ISO/DIS 639-3 code is 'dan'. |
LANG_ICE |
Icelandic language selection. ISO/DIS 639-3 code is 'isl'. |
LANG_POR |
Portuguese language selection. Spelling supported! ISO/DIS 639-3 code is 'por'. |
LANG_SPA |
Spanish language selection. Spelling supported! ISO/DIS 639-3 code is 'spa'. |
LANG_CAT |
Catalan language selection. Spelling supported! ISO/DIS 639-3 code is 'cat'. |
LANG_GAL |
Galician language selection. Alternate names are Gallegan and Gallego. Spoken in Spain and Portugal. ISO/DIS 639-3 code is 'glg'. |
LANG_ITA |
Italian language selection. Spelling supported! ISO/DIS 639-3 code is 'ita'. |
LANG_MAL |
Maltese language selection. ISO/DIS 639-3 code is 'mlt'. |
LANG_GRE |
Greek language selection. This selection includes the characters of the English language, as well. Spelling supported! ISO/DIS 639-3 code is 'ell'. |
LANG_POL |
Polish language selection. Spelling supported! ISO/DIS 639-3 code is 'pol'. |
LANG_CZH |
Czech language selection. Spelling supported! ISO/DIS 639-3 code is 'ces'. |
LANG_SLK |
Slovak language selection. ISO/DIS 639-3 code is 'slk'. |
LANG_HUN |
Hungarian language selection. Spelling supported! ISO/DIS 639-3 code is 'hun'. |
LANG_SLN |
Slovenian language selection. Spelling supported! ISO/DIS 639-3 code is 'slv'. |
LANG_CRO |
Croatian language selection. ISO/DIS 639-3 code is 'hrv'. |
LANG_ROM |
Romanian language selection. ISO/DIS 639-3 code is 'ron'. |
LANG_ALB |
Albanian language selection. ISO/DIS 639-3 code is 'sqi'. |
LANG_TUR |
Turkish language selection. Spelling supported! ISO/DIS 639-3 code is 'tur'. |
LANG_EST |
Estonian language selection. ISO/DIS 639-3 code is 'est'. |
LANG_LAT |
Latvian language selection. ISO/DIS 639-3 code is 'lav'. |
LANG_LIT |
Lithuanian language selection. ISO/DIS 639-3 code is 'lit'. |
LANG_ESP |
Esperanto language selection. Constructed language. Spelling supported! ISO/DIS 639-3 code is 'epo'. |
LANG_SRL |
Serbian (Latin) language selection. The Serbian language's ISO/DIS 639-3 code is 'srp', but the CSDK uses the 'qsl' local code for Latin Serbian writing. |
LANG_SRB |
Serbian (Cyrillic) language selection. This selection includes the characters of the English language, as well. The Serbian language's ISO/DIS 639-3 code is 'srp'. The CSDK uses this code for only the Cyrillic Serbian writing. |
LANG_MAC |
Macedonian (Cyrillic) language selection. This selection includes the characters of the English language, as well. ISO/DIS 639-3 code is 'mkd'. |
LANG_MOL |
Moldavian (Cyrillic) language selection. This selection includes the characters of the English language, as well. ISO/DIS 639-3 code is 'mol'. |
LANG_BUL |
Bulgarian (Cyrillic) language selection. This selection includes the characters of the English language, as well. ISO/DIS 639-3 code is 'bul'. |
LANG_BEL |
Byelorussian (Cyrillic) language selection. This selection includes the characters of the English language, as well. Other spellings are Belarusian and White Russian. ISO/DIS 639-3 code is 'bel'. |
LANG_UKR |
Ukrainian (Cyrillic) language selection. This selection includes the characters of the English language, as well. ISO/DIS 639-3 code is 'ukr'. |
LANG_RUS |
Russian (Cyrillic) language selection. This selection includes the characters of the English language, as well. Spelling supported! ISO/DIS 639-3 code is 'rus'. |
LANG_CHE |
Chechen language selection. This selection includes the characters of the English language, as well. ISO/DIS 639-3 code is 'che'. |
LANG_KAB |
Kabardian language selection. This selection includes the characters of the English language, as well. Alternate name is Beslenei. Spoken in Russia and Turkey. ISO/DIS 639-3 code is 'kbd'. |
LANG_AFR |
Afrikaans language selection. Spoken in South Africa. ISO/DIS 639-3 code is 'afr'. |
LANG_AYM |
Aymara language selection. Spoken in Bolivia and Peru. ISO/DIS 639-3 code is 'aym'. |
LANG_BAS |
Basque language selection. ISO/DIS 639-3 code is 'eus'. |
LANG_BEM |
Bemba language selection. Alternate names are Chibemba, Ichibemba, Wemba, Chiwemba. Spoken in Zambia and Democratic Republic of Congo. ISO/DIS 639-3 code is 'bem'. |
LANG_BLA |
Blackfoot language selection. Alternate names are Blackfeet, Siksika and Pikanii. Spoken in Canada and USA. ISO/DIS 639-3 code is 'bla'. |
LANG_BRE |
Breton language selection. ISO/DIS 639-3 code is 'bre'. |
LANG_BRA |
Portuguese (Brazilian) language selection. Spelling supported! There is no language code for the Brazilian Portuguese language in the ISO/DIS 639-3 standard. The CSDK uses the 'qbp' local code. |
LANG_BUG |
Bugotu language selection. Spoken in Solomon Islands. ISO/DIS 639-3 code is 'bgt'. |
LANG_CHA |
Chamorro language selection. Spoken in Guam and Northern Mariana Islands. ISO/DIS 639-3 code is 'cha'. |
LANG_CHU |
Chuana or Tswana language selection. Spoken in Botswana and South Africa. ISO/DIS 639-3 code is 'tsn'. |
LANG_COR |
Corsican language selection. ISO/DIS 639-3 code is 'cos'. |
LANG_CRW |
Crow language selection. Spoken in USA. ISO/DIS 639-3 code is 'cro'. |
LANG_ESK |
Eskimo language selection. This language selection is a collection of Eskimo and Inuit languages. There is no language code for it in the ISO/DIS 639-3 standard. The CSDK uses the 'qes' local code. |
LANG_FAR |
Faroese language selection. ISO/DIS 639-3 code is 'fao'. |
LANG_FIJ |
Fijian language selection. ISO/DIS 639-3 code is 'fij'. |
LANG_FRI |
Frisian language selection. This is a macro language of three Frisian languages in Germany. ISO/DIS 639-3 code is 'fry'. |
LANG_FRU |
Friulian language selection. Spoken in Italy. ISO/DIS 639-3 code is 'fur'. |
LANG_GLI |
Gaelic Irish language selection. ISO/DIS 639-3 code is 'gle'. |
LANG_GLS |
Gaelic Scottish language selection. ISO/DIS 639-3 code is 'gla'. |
LANG_GAN |
Ganda or Luganda language selection. Spoken in Uganda. ISO/DIS 639-3 code is 'lug'. |
LANG_GUA |
Guarani language selection. This is a macro language of the Chiripa and some Guarani languages. Spoken in Paraguay, Argentina, Bolivia and Brazil. ISO/DIS 639-3 code is 'grn'. |
LANG_HAN |
Hani language selection. Alternate names are Hanhi, Haw and Hani Proper. Spoken in China, Laos and Viet Nam. ISO/DIS 639-3 code is 'hni'. |
LANG_HAW |
Hawaiian language selection. ISO/DIS 639-3 code is 'haw'. |
LANG_IDO |
Ido language selection. Constructed language. ISO/DIS 639-3 code is 'ido'. |
LANG_IND |
Indonesian language selection. ISO/DIS 639-3 code is 'ind'. |
LANG_INT |
Interlingua language selection. Constructed language. ISO/DIS 639-3 code is 'ina'. |
LANG_KAS |
Kashubian language selection. Spoken in Poland. ISO/DIS 639-3 code is 'csb'. |
LANG_KAW |
Kawa language selection. Alternate names are Wa, Va, Vo, Wa Pwo and Wakut. Spoken in China. ISO/DIS 639-3 code is 'wbm'. |
LANG_KIK |
Kikuyu language selection. Spoken in Kenya. ISO/DIS 639-3 code is 'kik'. |
LANG_KON |
Kongo language selection. This is a macro language of Laari and Kongo languages. Spoken in the Democratic Republic of the Congo, Angola and Congo. ISO/DIS 639-3 code is 'kon'. |
LANG_KPE |
Kpelle language selection. This is a macro language of Kpelle languages. Spoken in Liberia and Guinea. ISO/DIS 639-3 code is 'kpe'. |
LANG_KUR |
Kurdish language selection - if written in the Latin alphabet. This is a macro language of the Kurdish languages. ISO/DIS 639-3 code is 'kur'. |
LANG_LTN |
Latin language selection. ISO/DIS 639-3 code is 'lat'. |
LANG_LUB |
Luba language selection. Alternate names are Luba-Lulua, Luba-Kasai, Tshiluba, Luva and Western Luba. Spoken in the Democratic Republic of the Congo. ISO/DIS 639-3 code is 'lua'. |
LANG_LUX |
Luxembourgian language selection. Alternate names are Luxembourgeois and Letzburgish. Spoken in Luxembourg. ISO/DIS 639-3 code is 'ltz'. |
LANG_MLG |
Malagasy language selection. This is a macro language of Malagasy languages. Spoken in Madagascar. ISO/DIS 639-3 code is 'mlg'. |
LANG_MLY |
Malay language selection. ISO/DIS 639-3 code is 'msa'. |
LANG_MLN |
Malinke language selection. Alternate names are Western Maninkakan, Malinka and Maninga. Spoken in Senegal, Gambia and Mali. ISO/DIS 639-3 code is 'mlq'. |
LANG_MAO |
Maori language selection. Spoken in New Zealand. ISO/DIS 639-3 code is 'mri'. |
LANG_MAY |
Mayan language selection. This is a language collection which is not supported by ISO/DIS 639-3, so the CSDK uses the ISO/DIS 639-2 code for this: 'MYN'. |
LANG_MIA |
Miao language selection. This is a macro language of Hmong languages. Alternate name is Hmong. Spoken in China, Laos, Thailand, Myanmar and Viet Nam. ISO/DIS 639-3 code is 'hmn'. |
LANG_MIN |
Minankabaw language selection. ISO/DIS 639-3 code is 'min'. |
LANG_MOH |
Mohawk language selection. Spoken in Canada and USA. ISO/DIS 639-3 code is 'moh'. |
LANG_NAH |
Nahuatl language selection. This is a language collection which is not supported by ISO/DIS 639-3, so the CSDK uses the ISO/DIS 639-2 code for this: 'NAH'. |
LANG_NYA |
Nyanja language selection. Alternate names are Chichewa and Chinyanja. Spoken in Malawi, Mozambique, Zambia and Zimbabwe. ISO/DIS 639-3 code is 'nya'. |
LANG_OCC |
Occidental language selection. Constructed language. ISO/DIS 639-3 code is 'occ'. |
LANG_OJI |
Ojibway language selection. This is a macro language of Ojibwa, Chippewa and Ottawa languages. Alternate names are Ojibwa and Ojibwe. Spoken in Canada and USA. ISO/DIS 639-3 code is 'oji'. |
LANG_PAP |
Papiamento language selection. Spoken in Netherlands Antilles, Aruba. ISO/DIS 639-3 code is 'pap'. |
LANG_PID |
Pidgin English language selection. Alternate names are Tok Pisin, Neomelanesian and New Guinean Pidgin English. Spoken in Papua New Guinea. ISO/DIS 639-3 code is 'tpi'. |
LANG_PRO |
Provencal language selection. Alternate name is Occitan. Spoken in France, Italy and Monaco. ISO/DIS 639-3 code is 'prv'. |
LANG_QUE |
Quechua language selection. This is a macro language of the Quechua languages. Spoken in Peru. ISO/DIS 639-3 code is 'que'. |
LANG_RHA |
Rhaetic language selection. Alternate names are Romansch and Rhaeto-Romance. Spoken in Switzerland. ISO/DIS 639-3 code is 'roh'. |
LANG_ROY |
Romany language selection. Spoken all over Europe. ISO/DIS 639-3 code is 'rom'. |
LANG_RUA |
Ruanda language selection. Alternate names are Kinyarwanda and Rwanda. Spoken in Rwanda, the Democratic Republic of Congo and Uganda. ISO/DIS 639-3 code is 'kin'. |
LANG_RUN |
Rundi language selection. Spoken in Burundi and Uganda. ISO/DIS 639-3 code is 'run'. |
LANG_SAM |
Samoan language selection. Spoken in Samoa and American Samoa. ISO/DIS 639-3 code is 'smo'. |
LANG_SAR |
Sardinian language selection. This is a macro language of the Sardinian languages. ISO/DIS 639-3 code is 'srd'. |
LANG_SHO |
Shona language selection. Spoken in Zimbabwe, Botswana and Zambia. ISO/DIS 639-3 code is 'sna'. |
LANG_SIO |
Sioux language selection. Alternate name is Dakota. Spoken in USA and Canada. ISO/DIS 639-3 code is 'dak'. |
LANG_SMI |
Sami language selection (Combination of the Sami language family). This is a language collection which is not supported by ISO/DIS 639-3, so the CSDK uses the ISO/DIS 639-2 code for this: 'SMI'. |
LANG_SML |
Lule Sami language selection. ISO/DIS 639-3 code is 'smj'. |
LANG_SMN |
Northern Sami language selection. ISO/DIS 639-3 code is 'sme'. |
LANG_SMS |
Southern Sami language selection. ISO/DIS 639-3 code is 'sma'. |
LANG_SOM |
Somali language selection. ISO/DIS 639-3 code is 'som'. |
LANG_SOT |
Sotho, Suto or Sesuto language selection. Spoken in Lesotho and South Africa. ISO/DIS 639-3 code is 'sot'. |
LANG_SUN |
Sundanese language selection. Alternate names are Sunda and Priangan. Spoken in Java and Bali in Indonesia. ISO/DIS 639-3 code is 'sun'. |
LANG_SWA |
Swahili language selection. This is a macro language of the Swahili languages. Spoken in the Democratic Republic of the Congo, Tanzania, Kenya and Somalia. ISO/DIS 639-3 code is 'swa'. |
LANG_SWZ |
Swazi language selection. Alternate names are Swati, Siswati and Tekela. Spoken in Swaziland, Lesotho, Mozambique and South Africa. ISO/DIS 639-3 code is 'ssw'. |
LANG_TAG |
Tagalog language selection. Spoken in Philippines. ISO/DIS 639-3 code is 'tgl'. |
LANG_TAH |
Tahitian language selection. ISO/DIS 639-3 code is 'tah'. |
LANG_TIN |
Pirez language selection. There is no language code for it in the ISO/DIS 639-3 standard. The CSDK uses the 'qti' local code. |
LANG_TON |
Tongan language selection. Alternate names are Tonga, Siska and Nyasa. Spoken in Malawi. ISO/DIS 639-3 code is 'ton'. |
LANG_TUN |
Tun language selection. Alternate names are Tunia and Tunya. Spoken in Chad. ISO/DIS 639-3 code is 'tug'. |
LANG_VIS |
Visayan language selection. The Visayan language actually consists of three languages: Cebuano, Hiligaynon and Samaran or Waray-waray. Spoken in the Philippines. There is no language code for it in the ISO/DIS 639-3 standard. The CSDK uses the 'qis' local code. |
LANG_WEL |
Welsh language selection. ISO/DIS 639-3 code is 'cym'. |
LANG_WEN |
Wend or Sorbian language selection. This is a language collection which is not supported by ISO/DIS 639-3, so the CSDK uses the ISO/DIS 639-2 code for this: 'WEN'. |
LANG_WOL |
Wolof language selection. Spoken in Senegal and Mauritania. ISO/DIS 639-3 code is 'wol'. |
LANG_XHO |
Xhosa language selection. Spoken in South Africa and Lesotho. ISO/DIS 639-3 code is 'xho'. |
LANG_ZAP |
Zapotec language selection. This is a macro language of the Zapotec languages. Spoken in Mexico. ISO/DIS 639-3 code is 'zap'. |
LANG_ZUL |
Zulu language selection. Spoken in South Africa, Lesotho, Malawi, Mozambique and Swaziland. ISO/DIS 639-3 code is 'zul'. |
LANG_JPN |
Japanese language selection. This selection includes the characters of the English language, as well. ISO/DIS 639-3 code is 'jpn'. |
LANG_CHS |
Simplified Chinese language selection. This selection includes the characters of the English language, as well. There is no language code for this writing mode in the ISO/DIS 639-3 standard. The CSDK uses the 'qcs' local code. |
LANG_CHT |
Traditional Chinese language selection. This selection includes the characters of the English language, as well. There is no language code for this writing mode in the ISO/DIS 639-3 standard. The CSDK uses the 'qct' local code. |
LANG_KRN |
Korean language selection. This selection includes the characters of the English language, as well. ISO/DIS 639-3 code is 'kor'. |
LANG_THA |
Thai language selection. ISO/DIS 639-3 code is 'tha'. This language is supported on: Windows, Linux, Mac OS X. |
LANG_ARA |
Arabic language selection. ISO/DIS 639-3 code is 'ara'. This language is supported on: Windows, Linux, Embedded Linux, Mac OS X. |
LANG_HEB |
Hebrew language selection. ISO/DIS 639-3 code is 'heb'. This language is supported on: Windows, Linux, Mac OS X. |
LANG_VIE |
Vietnamese (Latin) language selection. ISO/DIS 639-3 code is 'vie'. This language is supported on: Windows, Linux, Mac OS X. |
LANG_SIZE |
Number of directly selectable languages. |
enum MANAGE_LANG |
Language management actions.
This enum defines the possible management actions for the kRecManageLanguages function.
enum OUTCODEPAGETYPE |
Code page types.
Each output code page is classified into one of these categories.
enum RM_FLAGS |
Recognition Engines supporting a language.
These flags can be used to indicate the set of recognition engines supporting a given language. See LANGUAGE_INFO.
RECERR RECAPIKRN kRecCheckCodePage | ( | int | sid, |
LPWSTR | pMissingChrs, | ||
size_t | buflen | ||
) |
Checking the code page.
The kRecCheckCodePage checks whether the current Code Page setting contains all the characters of the current Language environment (language selection, the LanguagesPlus characters), and any characters listed as FilterPlus characters.
[in] | sid | Settings Collection ID. |
[out] | pMissingChrs | Pointer to a buffer to hold any missing characters returned by the function. |
[in] | buflen | Specifies the size of the buffer in bytes. It must be large enough to hold all the characters and the terminating double zero. |
RECERR |
RECERR kRecCheckCodePage(int sid, StringBuilder pMissingChrs);
RECERR RECAPIKRN kRecConvertCodePage2Unicode | ( | int | sid, |
const LPBYTE | pChar, | ||
size_t * | pBuffLen, | ||
LPWCH | pUniCode | ||
) |
Converting from the current code page to UNICODE.
This utility function converts a single character code from the current Code Page value to its UNICODE representation.
[in] | sid | Settings Collection ID. |
[in] | pChar | Character code to be converted. |
[in,out] | pBuffLen | Length of the input buffer. Upon returning it gets the exact length of the input code. |
[out] | pUniCode | Pointer of a variable to store the result of the conversion. |
RECERR |
RECERR kRecConvertCodePage2Unicode(int sid, byte[] pChar, out int pBuffLen, out char pUniCode);
RECERR RECAPIKRN kRecConvertUnicode2CodePage | ( | int | sid, |
WCHAR | UniCode, | ||
LPBYTE | pChar, | ||
size_t * | pBuffLen | ||
) |
Converting from UNICODE to the current code page.
This utility function converts a UNICODE character code to its representation in the current code page.
[in] | sid | Settings Collection ID. |
[in] | UniCode | Character code to be converted. |
[out] | pChar | Pointer of a variable to store the result of the conversion. |
[in,out] | pBuffLen | Length of the output buffer. Upon returning, it gets the exact length of the output code. |
RECERR |
If pChar is NULL, the function also gives the required length for storing the output code and the return value is CHR_CODELENGTH_ERR.
RECERR kRecConvertUnicode2CodePage(int sid, char UniCode, out byte[] pExport);
Searching for languages.
The kRecFindLanguage function searches for a single language with the given name.
[in] | pLangName | The name of the language. It can be an English name, an ISO 639-3, ISO 639-2/B or ISO 639-1 code, a Windows 3-letter code or a CSDK internal 3-letter code. |
[out] | pLanguage | The index of the first language. If no language can be found, LANG_NO will be put in this parameter. |
CHR_MULTIPLELANG_FOUND_WARN | More than one language has been found. The index of the most relevant language is put in the pLanguage parameter even in this case. Use one of the kRecFindLanguages or kRecFindLanguageEx functions to retrieve all the languages. |
RECERR | Other error |
RECERR kRecFindLanguage(string pLangName, out LANGUAGES pLanguage);
RECERR RECAPIKRN kRecFindLanguageEx | ( | LANGUAGE_CODE | coding, |
LPCTSTR | pLangName, | ||
LANGUAGES * | pLanguage, | ||
LANG_ENA * | pLanguages | ||
) |
Searching for languages.
The kRecFindLanguageEx function searches for the language or languages with the given name.
[in] | coding | Look for the language using all or one of the language coding standards. See LANGUAGE_CODE for details. |
[in] | pLangName | The name of the language to find. |
[out] | pLanguage | Returns the index of the found language. If no language can be found, LANG_NO will be put in this parameter. |
[out] | pLanguages | This parameter can be NULL. If not NULL, must point to an array having LANG_SIZE elements. All the languages matching pLangName by the required language coding are enabled in this array. |
CHR_MULTIPLELANG_FOUND_WARN | More than one language has been found. The index of the most relevant language is put in the pLanguage parameter even in this case, while pLanguages (if not NULL) will contain all of them as LANG_ENABLED. |
RECERR | Other error |
RECERR kRecFindLanguageEx(LANGUAGE_CODE coding, string pLangName, out LANGUAGES pLanguage, out LANG_ENA[] pLanguages);
// or when pLanguages is NULL in C/C++
RECERR kRecFindLanguageEx(LANGUAGE_CODE coding, string pLangName, out LANGUAGES pLanguage);
RECERR RECAPIKRN kRecFindLanguages | ( | const LANGUAGE_INFO * | pInfo, |
LANG_ENA * | pLanguages | ||
) |
Searching for languages.
The kRecFindLanguages function collects languages according to the given language information.
[in] | pInfo | Pointer to a structure containing the filter information. If a field is zero or an empty string, it does not affect the filtering. If a field has a real value, only the languages matching that value will be reported. The EnglishName field may contain not only the English name of the language, but also the language identifier from any of the following standards: ISO 639-3, ISO 639-2/B, ISO 639-1, Windows 3-letter code or a CSDK internal 3-letter code. |
[out] | pLanguages | Pointer to an array having LANG_SIZE elements to give back whether a language is selected or not. |
RECERR |
RECERR kRecFindLanguages(LANGUAGE_INFO pInfo, out LANG_ENA[] pLanguages);
RECERR RECAPIKRN kRecGetCodePage | ( | int | sid, |
LPTSTR | pCodePageName, | ||
size_t | buflen | ||
) |
Getting the code page name.
The kRecGetCodePage function will provide the current Code Page name.
[in] | sid | Settings Collection ID. |
[out] | pCodePageName | Pointer of a buffer for the current setting. |
[in] | buflen | Specifies the size of the buffer in bytes. |
RECERR |
RECERR kRecGetCodePage(int sid, out string pCodePageName);
RECERR RECAPIKRN kRecGetCodePageInfo | ( | LPCTSTR | pCodePageName, |
LPTSTR | pDesc, | ||
size_t | size, | ||
LPOUTCODEPAGETYPE | pCodePageType | ||
) |
Getting information about the code page.
The kRecGetCodePageInfo function provides information about the specified Code Page: a descriptive string and the category of the Code Page.
[in] | pCodePageName | Name of the Code Page inquired. |
[out] | pDesc | Pointer of a buffer to hold the Code Page descriptor information. |
[in] | size | Specifies the size of the buffer pDesc, in bytes (MAXCPNAMELEN is recommended). |
[out] | pCodePageType | Pointer to an OUTCODEPAGETYPE variable to hold basic Code Page category information. |
RECERR |
RECERR kRecGetCodePageInfo(string pCodePageName, out string pDesc, out OUTCODEPAGETYPE pCodePageType);
RECERR RECAPIKRN kRecGetDefaultFilter | ( | int | sid, |
CHR_FILTER * | pGlfilter | ||
) |
Getting the global character set filter.
The kRecGetDefaultFilter function inquires the current Global filter setting. The result will be the binary OR-ed combination of one or more disjunct members of CHR_FILTER.
[in] | sid | Settings Collection ID. |
[out] | pGlfilter | Pointer of a variable to get the current Global filter setting. |
RECERR |
RECERR kRecGetDefaultFilter(int sid, out CHR_FILTER filter);
RECERR RECAPIKRN kRecGetFilterPlus | ( | int | sid, |
LPWSTR | pFilterPlus, | ||
size_t | iSize | ||
) |
Getting FilterPlus characters.
The kRecGetFilterPlus gets the FilterPlus characters setting.
[in] | sid | Settings Collection ID. |
[out] | pFilterPlus | Pointer of a buffer to get the current FilterPlus character setting in UNICODE. |
[in] | iSize | Specifies the size of the buffer in bytes. It must be large enough to hold all the characters and the terminating wide-character zero. |
RECERR |
RECERR kRecGetFilterPlus(int sid, StringBuilder pFilterPlus);
RECERR RECAPIKRN kRecGetFirstCodePage | ( | LPTSTR | pCodePageName, |
size_t | buflen | ||
) |
Starting enumeration of code pages.
The kRecGetFirstCodePage function together with the kRecGetNextCodePage creates a listing of the available Code Pages.
[out] | pCodePageName | Pointer of a buffer for the name of the first available Code Page. |
[in] | buflen | Specifies the size of the buffer in bytes. |
RECERR |
*.SET
. The OmniPage CSDK is shipped with the Code Page Definition file, called RECOGN.SET
. RECERR kRecGetFirstCodePage(out string pCodePageName);
kRecGetFirstCodePage
and kRecGetNextCodePage in C#: RECERR kRecGetAllCodePages(out string[] codepages);
RECERR RECAPIKRN kRecGetLanguageInfo | ( | LANGUAGES | lang, |
LANGUAGE_INFO * | pInfo | ||
) |
Getting information about a language.
The kRecGetLanguageInfo function inquires information about a language.
[in] | lang | The ID of the inquired language. |
[out] | pInfo | Pointer of a structure to give back information about the chosen language. |
RECERR |
RECERR kRecGetLanguageInfo(LANGUAGES lang, out LANGUAGE_INFO pInfo);
Getting languages.
The kRecGetLanguages function inquires the current language selection.
[in] | sid | Settings Collection ID. |
[out] | pLanguages | Pointer to an array to get the current language selection. The size of the array must be LANG_SIZE. Each element of this array represents a language from LANGUAGES. |
RECERR |
RECERR kRecGetLanguages(int sid, LANG_ENA[] pLanguages);
RECERR RECAPIKRN kRecGetLanguagesPlus | ( | int | sid, |
LPWSTR | pOcrLplus, | ||
size_t | iBSize | ||
) |
Getting LanguagesPlus characters.
The kRecGetLanguagesPlus function inquires the current LanguagesPlus characters setting.
[in] | sid | Settings Collection ID. |
[out] | pOcrLplus | Pointer to a buffer to get the current LanguagesPlus character setting in UNICODE. |
[in] | iBSize | Specifies the size of the buffer in bytes. It must be large enough to hold all the characters and the terminating wide-character zero. |
RECERR |
RECERR kRecGetLanguagesPlus(int sid, StringBuilder pOcrLplus);
RECERR RECAPIKRN kRecGetMissingSymbol | ( | int | sid, |
LPWCH | pwMiss | ||
) |
Getting missing symbol character.
The kRecGetMissingSymbol function inquires the current missing symbol setting.
[in] | sid | Settings Collection ID. |
[out] | pwMiss | Pointer of a variable to get the missing symbol setting. |
RECERR |
RECERR kRecGetMissingSymbol(int sid, out char wMiss);
RECERR RECAPIKRN kRecGetNextCodePage | ( | LPTSTR | pCodePageName, |
size_t | buflen | ||
) |
Performing enumeration of code pages.
The kRecGetNextCodePage function together with the kRecGetFirstCodePage creates a listing of the available Code Pages.
[out] | pCodePageName | Pointer of a buffer for the name of the next available Code Page. |
[in] | buflen | Specifies the size of the buffer in bytes. |
RECERR |
REC_OK each time. As soon as the function finds no further item to get, it returns with CHR_NOMORE_WARN, signaling that the list is complete.
RECERR kRecGetNextCodePage(out string pCodePageName);
kRecGetNextCodePage
in C#: RECERR kRecGetAllCodePages(out string[] codepages);
Getting languages of the page.
The kRecGetPageLanguages function inquires the language selection for a given page.
[in] | hPage | Handle of the page. |
[out] | pOcrLanguages | Pointer to an array to get the language selection. The size of the array must be LANG_SIZE. Each element of this array represents a language from LANGUAGES. |
RECERR |
RECERR kRecGetPageLanguages(IntPtr hPage, LANG_ENA[] pLanguages);
RECERR RECAPIKRN kRecGetRejectionSymbol | ( | int | sid, |
LPWCH | pwRej | ||
) |
Getting the rejection symbol character.
The kRecGetRejectionSymbol function inquires the current rejection symbol setting.
[in] | sid | Settings Collection ID. |
[out] | pwRej | Pointer of a variable to get the rejection symbol setting. |
RECERR |
RECERR kRecGetRejectionSymbol(int sid, out char pwRej);
RECERR RECAPIKRN kRecGetSingleLanguageDetection | ( | int | sid, |
INTBOOL * | pbEnable | ||
) |
Getting the single language detection flag.
The kRecGetSingleLanguageDetection function retrieves the value of the Automatic Single Language Detection setting.
[in] | sid | Settings Collection ID. |
[out] | pbEnable | Pointer of a variable to store the single language detection flag. |
RECERR |
RECERR kRecGetSingleLanguageDetection(int sid, out bool bEnable);
RECERR RECAPIKRN kRecManageLanguages | ( | int | sid, |
MANAGE_LANG | action, | ||
LANGUAGES | language | ||
) |
Managing enabled languages.
The kRecManageLanguages function performs some basic management actions (Set, Add, Remove, Invert, Is Enabled) on the Language environment.
[in] | sid | Settings Collection ID. |
[in] | action | The management action to perform. See the MANAGE_LANG enum. |
[in] | language | The single language, or a language set, to enable, disable, or inquire. See the LANGUAGES enum. |
RECERR |
rc = kRecManageLanguages(sid, SET_LANG, LANG_ENG); rc = kRecManageLanguages(sid, ADD_LANG, LANG_GER);
With the IS_LANG_ENABLED action, kRecManageLanguages returns REC_OK if the language is enabled, and CHR_LANG_DISABLED_WARN if it is disabled. If the language parameter is a language set identifier (like LANG_ALL_ASIAN), REC_OK is returned if at least one of the languages in the set is enabled.
RECERR kRecManageLanguages(int sid, MANAGE_LANG action, LANGUAGES language);
RECERR RECAPIKRN kRecSetCodePage | ( | int | sid, |
LPCTSTR | pCodePageName | ||
) |
Setting the code page.
The kRecSetCodePage function specifies the Code Page setting of the Engine.
[in] | sid | Settings Collection ID. |
[in] | pCodePageName | Name of the Code Page to be set. The available Code Pages can be learnt using the kRecGetFirstCodePage and kRecGetNextCodePage function-pair. Auto code page can be selected by NULL or empty string. |
RECERR |
"Unicode"
or "UTF-8"
. RECERR kRecSetCodePage(int sid, string pCodePageName);
RECERR RECAPIKRN kRecSetDefaultFilter | ( | int | sid, |
CHR_FILTER | Glfilter | ||
) |
Changing global character set filter.
The kRecSetDefaultFilter function specifies the Global filter, i.e. a Character Set filter which will be applied globally, at page level. If this function is not called by the integrating application after the Engine's initialization, the value FILTER_ALL is applied, i.e. the Language environment will not be filtered globally. The Global filter setting is applied for all zones having the FILTER_DEFAULT in their ZONE::filter field.
[in] | sid | Settings Collection ID. |
[in] | Glfilter | Global Character Set filter to be applied. |
RECERR |
HPAGE hPage;
int sid = 0; // Settings Collection ID
. . .
rc = kRecManageLanguages(sid, SET_LANG, LANG_ENG);
rc = kRecLocateZones(sid, hPage, NULL);
// The function above locates the zones and gives them the FILTER_DEFAULT
// attribute
kRecSetDefaultFilter(sid, (CHR_FILTER)(FILTER_UPPERCASE | FILTER_DIGIT));
RECERR kRecSetDefaultFilter(int sid, CHR_FILTER filter);
RECERR RECAPIKRN kRecSetFilterPlus | ( | int | sid, |
LPCWSTR | pFilterPlus | ||
) |
Setting FilterPlus characters.
The kRecSetFilterPlus function specifies a set of individual characters, the FilterPlus characters. The FilterPlus characters can broaden the filtered set of characters globally or on a per-zone basis. To allow the use of these FilterPlus characters, the zone's ZONE::filter field should have the FILTER_PLUS value enabled.
[in] | sid | Settings Collection ID. |
[in] | pFilterPlus | Pointer of a UNICODE string containing the FilterPlus characters to be set. (The string is terminated with a double zero.) |
RECERR |
RECERR kRecSetFilterPlus(int sid, string pFilterPlus);
Setting languages.
The kRecSetLanguages function defines the main part of the Language environment of the Character Set. The available languages are represented by the LANGUAGES enum.
[in] | sid | Settings Collection ID. |
[in] | pLanguages | Address of a LANG_ENA array containing the enabled/disabled information for each language available. |
RECERR |
LANG_ENA pLang[LANG_SIZE]; for (int i=0; i<LANG_SIZE; i++) { pLang[i] = LANG_DISABLED; } pLang[LANG_ENG] = LANG_ENABLED; pLang[LANG_GER] = LANG_ENABLED; rc = kRecSetLanguages(sid, pLang);
RECERR kRecSetLanguages(int sid, LANG_ENA[] pLanguages);
RECERR RECAPIKRN kRecSetLanguagesPlus | ( | int | sid, |
LPCWSTR | pOcrLplus | ||
) |
Setting LanguagesPlus characters.
The kRecSetLanguagesPlus function specifies some individual characters, the LanguagesPlus characters. The set of LanguagesPlus characters is added to the set of characters determined by the language selection (kRecSetLanguages). The resulting set of characters is called the Language environment.
[in] | sid | Settings Collection ID. |
[in] | pOcrLplus | Pointer to a UNICODE string containing the LanguagesPlus characters to be set. (The string is terminated with a wide-character zero.) Page Characters and Code Pages collects all the character codes that can be used here. |
RECERR |
WCHAR *pLangPlus = L"éÉ"; rc = kRecManageLanguages(sid, SET_LANG, LANG_GER); rc = kRecSetLanguagesPlus(sid, pLangPlus);
RECERR kRecSetLanguagesPlus(int sid, string pOcrLplus);
RECERR RECAPIKRN kRecSetMissingSymbol | ( | int | sid, |
WCHAR | wMiss | ||
) |
Setting the missing symbol character.
The kRecSetMissingSymbol specifies the code of the missing symbol. The missing symbol is a special character that replaces any character that was recognized by the Engine but could not be represented in the final output document, since the character does not exist in the current Code Page.
[in] | sid | Settings Collection ID. |
[in] | wMiss | The missing symbol to be set. |
RECERR |
CHR_MISSINGEXPORT_ERR
error code is returned. RECERR kRecSetMissingSymbol(int sid, char wMiss);
RECERR RECAPIKRN kRecSetRejectionSymbol | ( | int | sid, |
WCHAR | wRej | ||
) |
Setting the rejection symbol character.
The kRecSetRejectionSymbol function specifies which character is to be used as a symbol for the rejected characters (i.e. unrecognized by the recognition module used in the zone) in the final output document.
[in] | sid | Settings Collection ID. |
[in] | wRej | The rejection symbol to be set. |
RECERR |
CHR_MISSINGEXPORT_ERR
error code is returned. RECERR kRecSetRejectionSymbol(int sid, char wRej);
RECERR RECAPIKRN kRecSetSingleLanguageDetection | ( | int | sid, |
INTBOOL | bEnable | ||
) |
Automatic Single Language Detection.
The kRecSetSingleLanguageDetection function enables or disables Automatic Single Language Detection mode.
[in] | sid | Settings Collection ID. |
[in] | bEnable | Flag that indicates whether Automatic Single Language Detection is enabled or disabled. |
RECERR |
rc = kRecSetSingleLanguageDetection(sid, TRUE); rc = kRecManageLanguages(sid, SET_LANG, LANG_ALL_LATIN); rc = kRecManageLanguages(sid, ADD_LANG, LANG_ALL_ASIAN);
rc = kRecSetSingleLanguageDetection(sid, TRUE); rc = kRecManageLanguages(sid, SET_LANG, LANG_NO); rc = kRecManageLanguages(sid, ADD_LANG, LANG_ENG); rc = kRecManageLanguages(sid, ADD_LANG, LANG_GER); rc = kRecManageLanguages(sid, ADD_LANG, LANG_FRE); rc = kRecManageLanguages(sid, ADD_LANG, LANG_JPN); rc = kRecManageLanguages(sid, ADD_LANG, LANG_KRN);
RECERR kRecSetSingleLanguageDetection(int sid, bool bEnable);