// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 **********************************************************************
 *   Copyright (C) 2005-2015, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */
9
10#ifndef __CSRSBCS_H
11#define __CSRSBCS_H
12
13#include "unicode/uobject.h"
14
15#if !UCONFIG_NO_CONVERSION
16
17#include "csrecog.h"
18
19U_NAMESPACE_BEGIN
20
21class NGramParser : public UMemory
22{
23private:
24 int32_t ngram;
25 const int32_t *ngramList;
26
27 int32_t ngramCount;
28 int32_t hitCount;
29
30protected:
31 int32_t byteIndex;
32 const uint8_t *charMap;
33
34 void addByte(int32_t b);
35
36public:
37 NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap);
38 virtual ~NGramParser();
39
40private:
41 /*
42 * Binary search for value in table, which must have exactly 64 entries.
43 */
44 int32_t search(const int32_t *table, int32_t value);
45
46 void lookup(int32_t thisNgram);
47
48 virtual int32_t nextByte(InputText *det);
49 virtual void parseCharacters(InputText *det);
50
51public:
52 int32_t parse(InputText *det);
53
54};
55
56#if !UCONFIG_ONLY_HTML_CONVERSION
57class NGramParser_IBM420 : public NGramParser
58{
59public:
60 NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap);
61 ~NGramParser_IBM420();
62
63private:
64 int32_t alef;
65 int32_t isLamAlef(int32_t b);
66 int32_t nextByte(InputText *det);
67 void parseCharacters(InputText *det);
68};
69#endif
70
71
72class CharsetRecog_sbcs : public CharsetRecognizer
73{
74public:
75 CharsetRecog_sbcs();
76 virtual ~CharsetRecog_sbcs();
77 virtual const char *getName() const = 0;
78 virtual UBool match(InputText *det, CharsetMatch *results) const = 0;
79 virtual int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const;
80};
81
82class CharsetRecog_8859_1 : public CharsetRecog_sbcs
83{
84public:
85 virtual ~CharsetRecog_8859_1();
86 const char *getName() const;
87 virtual UBool match(InputText *det, CharsetMatch *results) const;
88};
89
90class CharsetRecog_8859_2 : public CharsetRecog_sbcs
91{
92public:
93 virtual ~CharsetRecog_8859_2();
94 const char *getName() const;
95 virtual UBool match(InputText *det, CharsetMatch *results) const;
96};
97
98class CharsetRecog_8859_5 : public CharsetRecog_sbcs
99{
100public:
101 virtual ~CharsetRecog_8859_5();
102 const char *getName() const;
103};
104
105class CharsetRecog_8859_6 : public CharsetRecog_sbcs
106{
107public:
108 virtual ~CharsetRecog_8859_6();
109
110 const char *getName() const;
111};
112
113class CharsetRecog_8859_7 : public CharsetRecog_sbcs
114{
115public:
116 virtual ~CharsetRecog_8859_7();
117
118 const char *getName() const;
119};
120
121class CharsetRecog_8859_8 : public CharsetRecog_sbcs
122{
123public:
124 virtual ~CharsetRecog_8859_8();
125
126 virtual const char *getName() const;
127};
128
129class CharsetRecog_8859_9 : public CharsetRecog_sbcs
130{
131public:
132 virtual ~CharsetRecog_8859_9();
133
134 const char *getName() const;
135};
136
137
138
139class CharsetRecog_8859_5_ru : public CharsetRecog_8859_5
140{
141public:
142 virtual ~CharsetRecog_8859_5_ru();
143
144 const char *getLanguage() const;
145
146 virtual UBool match(InputText *det, CharsetMatch *results) const;
147};
148
149class CharsetRecog_8859_6_ar : public CharsetRecog_8859_6
150{
151public:
152 virtual ~CharsetRecog_8859_6_ar();
153
154 const char *getLanguage() const;
155
156 virtual UBool match(InputText *det, CharsetMatch *results) const;
157};
158
159class CharsetRecog_8859_7_el : public CharsetRecog_8859_7
160{
161public:
162 virtual ~CharsetRecog_8859_7_el();
163
164 const char *getLanguage() const;
165
166 virtual UBool match(InputText *det, CharsetMatch *results) const;
167};
168
169class CharsetRecog_8859_8_I_he : public CharsetRecog_8859_8
170{
171public:
172 virtual ~CharsetRecog_8859_8_I_he();
173
174 const char *getName() const;
175
176 const char *getLanguage() const;
177
178 virtual UBool match(InputText *det, CharsetMatch *results) const;
179};
180
181class CharsetRecog_8859_8_he : public CharsetRecog_8859_8
182{
183public:
184 virtual ~CharsetRecog_8859_8_he ();
185
186 const char *getLanguage() const;
187
188 virtual UBool match(InputText *det, CharsetMatch *results) const;
189};
190
191class CharsetRecog_8859_9_tr : public CharsetRecog_8859_9
192{
193public:
194 virtual ~CharsetRecog_8859_9_tr ();
195
196 const char *getLanguage() const;
197
198 virtual UBool match(InputText *det, CharsetMatch *results) const;
199};
200
201class CharsetRecog_windows_1256 : public CharsetRecog_sbcs
202{
203public:
204 virtual ~CharsetRecog_windows_1256();
205
206 const char *getName() const;
207
208 const char *getLanguage() const;
209
210 virtual UBool match(InputText *det, CharsetMatch *results) const;
211};
212
213class CharsetRecog_windows_1251 : public CharsetRecog_sbcs
214{
215public:
216 virtual ~CharsetRecog_windows_1251();
217
218 const char *getName() const;
219
220 const char *getLanguage() const;
221
222 virtual UBool match(InputText *det, CharsetMatch *results) const;
223};
224
225
226class CharsetRecog_KOI8_R : public CharsetRecog_sbcs
227{
228public:
229 virtual ~CharsetRecog_KOI8_R();
230
231 const char *getName() const;
232
233 const char *getLanguage() const;
234
235 virtual UBool match(InputText *det, CharsetMatch *results) const;
236};
237
238#if !UCONFIG_ONLY_HTML_CONVERSION
239class CharsetRecog_IBM424_he : public CharsetRecog_sbcs
240{
241public:
242 virtual ~CharsetRecog_IBM424_he();
243
244 const char *getLanguage() const;
245};
246
247class CharsetRecog_IBM424_he_rtl : public CharsetRecog_IBM424_he {
248public:
249 virtual ~CharsetRecog_IBM424_he_rtl();
250
251 const char *getName() const;
252
253 virtual UBool match(InputText *det, CharsetMatch *results) const;
254};
255
256class CharsetRecog_IBM424_he_ltr : public CharsetRecog_IBM424_he {
257 virtual ~CharsetRecog_IBM424_he_ltr();
258
259 const char *getName() const;
260
261 virtual UBool match(InputText *det, CharsetMatch *results) const;
262};
263
264class CharsetRecog_IBM420_ar : public CharsetRecog_sbcs
265{
266public:
267 virtual ~CharsetRecog_IBM420_ar();
268
269 const char *getLanguage() const;
270 int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const;
271
272};
273
274class CharsetRecog_IBM420_ar_rtl : public CharsetRecog_IBM420_ar {
275public:
276 virtual ~CharsetRecog_IBM420_ar_rtl();
277
278 const char *getName() const;
279
280 virtual UBool match(InputText *det, CharsetMatch *results) const;
281};
282
283class CharsetRecog_IBM420_ar_ltr : public CharsetRecog_IBM420_ar {
284 virtual ~CharsetRecog_IBM420_ar_ltr();
285
286 const char *getName() const;
287
288 virtual UBool match(InputText *det, CharsetMatch *results) const;
289};
290#endif
291
292U_NAMESPACE_END
293
294#endif /* !UCONFIG_NO_CONVERSION */
295#endif /* __CSRSBCS_H */