Blame - libicu/cts_headers/csrmbcs.h - platform/external/icu

blob: 8ccf1d56a95f65e50f9d60351d896b0c07b3c1b1 [file] [log] [blame]

Victor Chang	7322950	2020-09-17 13:39:19 +0100	[diff] [blame]	1	// © 2016 and later: Unicode, Inc. and others.
				2	// License & terms of use: http://www.unicode.org/copyright.html
				3	/*
				4	**********************************************************************
				5	* Copyright (C) 2005-2012, International Business Machines
				6	* Corporation and others. All Rights Reserved.
				7	**********************************************************************
				8	*/
				9
				10	#ifndef __CSRMBCS_H
				11	#define __CSRMBCS_H
				12
				13	#include "unicode/utypes.h"
				14
				15	#if !UCONFIG_NO_CONVERSION
				16
				17	#include "csrecog.h"
				18
				19	U_NAMESPACE_BEGIN
				20
				21	// "Character" iterated character class.
				22	// Recognizers for specific mbcs encodings make their "characters" available
				23	// by providing a nextChar() function that fills in an instance of IteratedChar
				24	// with the next char from the input.
				25	// The returned characters are not converted to Unicode, but remain as the raw
				26	// bytes (concatenated into an int) from the codepage data.
				27	//
				28	// For Asian charsets, use the raw input rather than the input that has been
				29	// stripped of markup. Detection only considers multi-byte chars, effectively
				30	// stripping markup anyway, and double byte chars do occur in markup too.
				31	//
				32	class IteratedChar : public UMemory
				33	{
				34	public:
				35	uint32_t charValue; // 1-4 bytes from the raw input data
				36	int32_t index;
				37	int32_t nextIndex;
				38	UBool error;
				39	UBool done;
				40
				41	public:
				42	IteratedChar();
				43	//void reset();
				44	int32_t nextByte(InputText* det);
				45	};
				46
				47
				48	class CharsetRecog_mbcs : public CharsetRecognizer {
				49
				50	protected:
				51	/**
				52	* Test the match of this charset with the input text data
				53	* which is obtained via the CharsetDetector object.
				54	*
				55	* @param det The CharsetDetector, which contains the input text
				56	* to be checked for being in this charset.
				57	* @return Two values packed into one int (Damn java, anyhow)
				58	* <br/>
				59	* bits 0-7: the match confidence, ranging from 0-100
				60	* <br/>
				61	* bits 8-15: The match reason, an enum-like value.
				62	*/
				63	int32_t match_mbcs(InputText* det, const uint16_t commonChars[], int32_t commonCharsLen) const;
				64
				65	public:
				66
				67	virtual ~CharsetRecog_mbcs();
				68
				69	/**
				70	* Get the IANA name of this charset.
				71	* @return the charset name.
				72	*/
				73
				74	const char *getName() const = 0;
				75	const char *getLanguage() const = 0;
				76	UBool match(InputText* input, CharsetMatch *results) const = 0;
				77
				78	/**
				79	* Get the next character (however many bytes it is) from the input data
				80	* Subclasses for specific charset encodings must implement this function
				81	* to get characters according to the rules of their encoding scheme.
				82	*
				83	* This function is not a method of class IteratedChar only because
				84	* that would require a lot of extra derived classes, which is awkward.
				85	* @param it The IteratedChar "struct" into which the returned char is placed.
				86	* @param det The charset detector, which is needed to get at the input byte data
				87	* being iterated over.
				88	* @return True if a character was returned, false at end of input.
				89	*/
				90	virtual UBool nextChar(IteratedChar it, InputText textIn) const = 0;
				91
				92	};
				93
				94
				95	/**
				96	* Shift-JIS charset recognizer.
				97	*
				98	*/
				99	class CharsetRecog_sjis : public CharsetRecog_mbcs {
				100	public:
				101	virtual ~CharsetRecog_sjis();
				102
				103	UBool nextChar(IteratedChar it, InputText det) const;
				104
				105	UBool match(InputText* input, CharsetMatch *results) const;
				106
				107	const char *getName() const;
				108	const char *getLanguage() const;
				109
				110	};
				111
				112
				113	/**
				114	* EUC charset recognizers. One abstract class that provides the common function
				115	* for getting the next character according to the EUC encoding scheme,
				116	* and nested derived classes for EUC_KR, EUC_JP, EUC_CN.
				117	*
				118	*/
				119	class CharsetRecog_euc : public CharsetRecog_mbcs
				120	{
				121	public:
				122	virtual ~CharsetRecog_euc();
				123
				124	const char *getName() const = 0;
				125	const char *getLanguage() const = 0;
				126
				127	UBool match(InputText* input, CharsetMatch *results) const = 0;
				128	/*
				129	* (non-Javadoc)
				130	* Get the next character value for EUC based encodings.
				131	* Character "value" is simply the raw bytes that make up the character
				132	* packed into an int.
				133	*/
				134	UBool nextChar(IteratedChar it, InputText det) const;
				135	};
				136
				137	/**
				138	* The charset recognizer for EUC-JP. A singleton instance of this class
				139	* is created and kept by the public CharsetDetector class
				140	*/
				141	class CharsetRecog_euc_jp : public CharsetRecog_euc
				142	{
				143	public:
				144	virtual ~CharsetRecog_euc_jp();
				145
				146	const char *getName() const;
				147	const char *getLanguage() const;
				148
				149	UBool match(InputText* input, CharsetMatch *results) const;
				150	};
				151
				152	/**
				153	* The charset recognizer for EUC-KR. A singleton instance of this class
				154	* is created and kept by the public CharsetDetector class
				155	*/
				156	class CharsetRecog_euc_kr : public CharsetRecog_euc
				157	{
				158	public:
				159	virtual ~CharsetRecog_euc_kr();
				160
				161	const char *getName() const;
				162	const char *getLanguage() const;
				163
				164	UBool match(InputText* input, CharsetMatch *results) const;
				165	};
				166
				167	/**
				168	*
				169	* Big5 charset recognizer.
				170	*
				171	*/
				172	class CharsetRecog_big5 : public CharsetRecog_mbcs
				173	{
				174	public:
				175	virtual ~CharsetRecog_big5();
				176
				177	UBool nextChar(IteratedChar* it, InputText* det) const;
				178
				179	const char *getName() const;
				180	const char *getLanguage() const;
				181
				182	UBool match(InputText* input, CharsetMatch *results) const;
				183	};
				184
				185
				186	/**
				187	*
				188	* GB-18030 recognizer. Uses simplified Chinese statistics.
				189	*
				190	*/
				191	class CharsetRecog_gb_18030 : public CharsetRecog_mbcs
				192	{
				193	public:
				194	virtual ~CharsetRecog_gb_18030();
				195
				196	UBool nextChar(IteratedChar* it, InputText* det) const;
				197
				198	const char *getName() const;
				199	const char *getLanguage() const;
				200
				201	UBool match(InputText* input, CharsetMatch *results) const;
				202	};
				203
				204	U_NAMESPACE_END
				205
				206	#endif
				207	#endif /* __CSRMBCS_H */