| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /* |
| ***************************************************************** |
| * Copyright (c) 2002-2014, International Business Machines Corporation |
| * and others. All Rights Reserved. |
| ***************************************************************** |
| * Date Name Description |
| * 06/06/2002 aliu Creation. |
| ***************************************************************** |
| */ |
| |
| #include "unicode/utypes.h" |
| |
| #if !UCONFIG_NO_TRANSLITERATION |
| |
| #include "unicode/uobject.h" |
| #include "unicode/uscript.h" |
| |
| #include "anytrans.h" |
| #include "hash.h" |
| #include "mutex.h" |
| #include "nultrans.h" |
| #include "putilimp.h" |
| #include "tridpars.h" |
| #include "uinvchar.h" |
| #include "uvector.h" |
| |
| //------------------------------------------------------------ |
| // Constants |
| |
| static const UChar TARGET_SEP = 45; // '-' |
| static const UChar VARIANT_SEP = 47; // '/' |
| static const UChar ANY[] = {0x41,0x6E,0x79,0}; // "Any" |
| static const UChar NULL_ID[] = {78,117,108,108,0}; // "Null" |
| static const UChar LATIN_PIVOT[] = {0x2D,0x4C,0x61,0x74,0x6E,0x3B,0x4C,0x61,0x74,0x6E,0x2D,0}; // "-Latn;Latn-" |
| |
| // initial size for an Any-XXXX transform's cache of script-XXXX transforms |
| // (will grow as necessary, but we don't expect to have source text with more than 7 scripts) |
| #define ANY_TRANS_CACHE_INIT_SIZE 7 |
| |
| //------------------------------------------------------------ |
| |
| U_CDECL_BEGIN |
| /** |
| * Deleter function for Transliterator*. |
| */ |
| static void U_CALLCONV |
| _deleteTransliterator(void *obj) { |
| delete (icu::Transliterator*) obj; |
| } |
| U_CDECL_END |
| |
| //------------------------------------------------------------ |
| |
| U_NAMESPACE_BEGIN |
| |
| //------------------------------------------------------------ |
| // ScriptRunIterator |
| |
| /** |
| * Returns a series of ranges corresponding to scripts. They will be |
| * of the form: |
| * |
| * ccccSScSSccccTTcTcccc - c = common, S = first script, T = second |
| * | | - first run (start, limit) |
| * | | - second run (start, limit) |
| * |
| * That is, the runs will overlap. The reason for this is so that a |
| * transliterator can consider common characters both before and after |
| * the scripts. |
| */ |
| class ScriptRunIterator : public UMemory { |
| private: |
| const Replaceable& text; |
| int32_t textStart; |
| int32_t textLimit; |
| |
| public: |
| /** |
| * The code of the current run, valid after next() returns. May |
| * be USCRIPT_INVALID_CODE if and only if the entire text is |
| * COMMON/INHERITED. |
| */ |
| UScriptCode scriptCode; |
| |
| /** |
| * The start of the run, inclusive, valid after next() returns. |
| */ |
| int32_t start; |
| |
| /** |
| * The end of the run, exclusive, valid after next() returns. |
| */ |
| int32_t limit; |
| |
| /** |
| * Constructs a run iterator over the given text from start |
| * (inclusive) to limit (exclusive). |
| */ |
| ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit); |
| |
| /** |
| * Returns TRUE if there are any more runs. TRUE is always |
| * returned at least once. Upon return, the caller should |
| * examine scriptCode, start, and limit. |
| */ |
| UBool next(); |
| |
| /** |
| * Adjusts internal indices for a change in the limit index of the |
| * given delta. A positive delta means the limit has increased. |
| */ |
| void adjustLimit(int32_t delta); |
| |
| private: |
| ScriptRunIterator(const ScriptRunIterator &other); // forbid copying of this class |
| ScriptRunIterator &operator=(const ScriptRunIterator &other); // forbid copying of this class |
| }; |
| |
| ScriptRunIterator::ScriptRunIterator(const Replaceable& theText, |
| int32_t myStart, int32_t myLimit) : |
| text(theText) |
| { |
| textStart = myStart; |
| textLimit = myLimit; |
| limit = myStart; |
| } |
| |
| UBool ScriptRunIterator::next() { |
| UChar32 ch; |
| UScriptCode s; |
| UErrorCode ec = U_ZERO_ERROR; |
| |
| scriptCode = USCRIPT_INVALID_CODE; // don't know script yet |
| start = limit; |
| |
| // Are we done? |
| if (start == textLimit) { |
| return FALSE; |
| } |
| |
| // Move start back to include adjacent COMMON or INHERITED |
| // characters |
| while (start > textStart) { |
| ch = text.char32At(start - 1); // look back |
| s = uscript_getScript(ch, &ec); |
| if (s == USCRIPT_COMMON || s == USCRIPT_INHERITED) { |
| --start; |
| } else { |
| break; |
| } |
| } |
| |
| // Move limit ahead to include COMMON, INHERITED, and characters |
| // of the current script. |
| while (limit < textLimit) { |
| ch = text.char32At(limit); // look ahead |
| s = uscript_getScript(ch, &ec); |
| if (s != USCRIPT_COMMON && s != USCRIPT_INHERITED) { |
| if (scriptCode == USCRIPT_INVALID_CODE) { |
| scriptCode = s; |
| } else if (s != scriptCode) { |
| break; |
| } |
| } |
| ++limit; |
| } |
| |
| // Return TRUE even if the entire text is COMMON / INHERITED, in |
| // which case scriptCode will be USCRIPT_INVALID_CODE. |
| return TRUE; |
| } |
| |
| void ScriptRunIterator::adjustLimit(int32_t delta) { |
| limit += delta; |
| textLimit += delta; |
| } |
| |
| //------------------------------------------------------------ |
| // AnyTransliterator |
| |
| UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator) |
| |
| AnyTransliterator::AnyTransliterator(const UnicodeString& id, |
| const UnicodeString& theTarget, |
| const UnicodeString& theVariant, |
| UScriptCode theTargetScript, |
| UErrorCode& ec) : |
| Transliterator(id, NULL), |
| targetScript(theTargetScript) |
| { |
| cache = uhash_openSize(uhash_hashLong, uhash_compareLong, NULL, ANY_TRANS_CACHE_INIT_SIZE, &ec); |
| if (U_FAILURE(ec)) { |
| return; |
| } |
| uhash_setValueDeleter(cache, _deleteTransliterator); |
| |
| target = theTarget; |
| if (theVariant.length() > 0) { |
| target.append(VARIANT_SEP).append(theVariant); |
| } |
| } |
| |
| AnyTransliterator::~AnyTransliterator() { |
| uhash_close(cache); |
| } |
| |
| /** |
| * Copy constructor. |
| */ |
| AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) : |
| Transliterator(o), |
| target(o.target), |
| targetScript(o.targetScript) |
| { |
| // Don't copy the cache contents |
| UErrorCode ec = U_ZERO_ERROR; |
| cache = uhash_openSize(uhash_hashLong, uhash_compareLong, NULL, ANY_TRANS_CACHE_INIT_SIZE, &ec); |
| if (U_FAILURE(ec)) { |
| return; |
| } |
| uhash_setValueDeleter(cache, _deleteTransliterator); |
| } |
| |
| /** |
| * Transliterator API. |
| */ |
| AnyTransliterator* AnyTransliterator::clone() const { |
| return new AnyTransliterator(*this); |
| } |
| |
| /** |
| * Implements {@link Transliterator#handleTransliterate}. |
| */ |
| void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos, |
| UBool isIncremental) const { |
| int32_t allStart = pos.start; |
| int32_t allLimit = pos.limit; |
| |
| ScriptRunIterator it(text, pos.contextStart, pos.contextLimit); |
| |
| while (it.next()) { |
| // Ignore runs in the ante context |
| if (it.limit <= allStart) continue; |
| |
| // Try to instantiate transliterator from it.scriptCode to |
| // our target or target/variant |
| Transliterator* t = getTransliterator(it.scriptCode); |
| |
| if (t == NULL) { |
| // We have no transliterator. Do nothing, but keep |
| // pos.start up to date. |
| pos.start = it.limit; |
| continue; |
| } |
| |
| // If the run end is before the transliteration limit, do |
| // a non-incremental transliteration. Otherwise do an |
| // incremental one. |
| UBool incremental = isIncremental && (it.limit >= allLimit); |
| |
| pos.start = uprv_max(allStart, it.start); |
| pos.limit = uprv_min(allLimit, it.limit); |
| int32_t limit = pos.limit; |
| t->filteredTransliterate(text, pos, incremental); |
| int32_t delta = pos.limit - limit; |
| allLimit += delta; |
| it.adjustLimit(delta); |
| |
| // We're done if we enter the post context |
| if (it.limit >= allLimit) break; |
| } |
| |
| // Restore limit. pos.start is fine where the last transliterator |
| // left it, or at the end of the last run. |
| pos.limit = allLimit; |
| } |
| |
| Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const { |
| |
| if (source == targetScript || source == USCRIPT_INVALID_CODE) { |
| return NULL; |
| } |
| |
| Transliterator* t = NULL; |
| { |
| Mutex m(NULL); |
| t = (Transliterator*) uhash_iget(cache, (int32_t) source); |
| } |
| if (t == NULL) { |
| UErrorCode ec = U_ZERO_ERROR; |
| UnicodeString sourceName(uscript_getShortName(source), -1, US_INV); |
| UnicodeString id(sourceName); |
| id.append(TARGET_SEP).append(target); |
| |
| t = Transliterator::createInstance(id, UTRANS_FORWARD, ec); |
| if (U_FAILURE(ec) || t == NULL) { |
| delete t; |
| |
| // Try to pivot around Latin, our most common script |
| id = sourceName; |
| id.append(LATIN_PIVOT, -1).append(target); |
| t = Transliterator::createInstance(id, UTRANS_FORWARD, ec); |
| if (U_FAILURE(ec) || t == NULL) { |
| delete t; |
| t = NULL; |
| } |
| } |
| |
| if (t != NULL) { |
| Transliterator *rt = NULL; |
| { |
| Mutex m(NULL); |
| rt = static_cast<Transliterator *> (uhash_iget(cache, (int32_t) source)); |
| if (rt == NULL) { |
| // Common case, no race to cache this new transliterator. |
| uhash_iput(cache, (int32_t) source, t, &ec); |
| } else { |
| // Race case, some other thread beat us to caching this transliterator. |
| Transliterator *temp = rt; |
| rt = t; // Our newly created transliterator that lost the race & now needs deleting. |
| t = temp; // The transliterator from the cache that we will return. |
| } |
| } |
| delete rt; // will be non-null only in case of races. |
| } |
| } |
| return t; |
| } |
| |
| /** |
| * Return the script code for a given name, or -1 if not found. |
| */ |
| static UScriptCode scriptNameToCode(const UnicodeString& name) { |
| char buf[128]; |
| UScriptCode code; |
| UErrorCode ec = U_ZERO_ERROR; |
| int32_t nameLen = name.length(); |
| UBool isInvariant = uprv_isInvariantUString(name.getBuffer(), nameLen); |
| |
| if (isInvariant) { |
| name.extract(0, nameLen, buf, (int32_t)sizeof(buf), US_INV); |
| buf[127] = 0; // Make sure that we NULL terminate the string. |
| } |
| if (!isInvariant || uscript_getCode(buf, &code, 1, &ec) != 1 || U_FAILURE(ec)) |
| { |
| code = USCRIPT_INVALID_CODE; |
| } |
| return code; |
| } |
| |
| /** |
| * Registers standard transliterators with the system. Called by |
| * Transliterator during initialization. Scan all current targets and |
| * register those that are scripts T as Any-T/V. |
| */ |
| void AnyTransliterator::registerIDs() { |
| |
| UErrorCode ec = U_ZERO_ERROR; |
| Hashtable seen(TRUE, ec); |
| |
| int32_t sourceCount = Transliterator::_countAvailableSources(); |
| for (int32_t s=0; s<sourceCount; ++s) { |
| UnicodeString source; |
| Transliterator::_getAvailableSource(s, source); |
| |
| // Ignore the "Any" source |
| if (source.caseCompare(ANY, 3, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue; |
| |
| int32_t targetCount = Transliterator::_countAvailableTargets(source); |
| for (int32_t t=0; t<targetCount; ++t) { |
| UnicodeString target; |
| Transliterator::_getAvailableTarget(t, source, target); |
| |
| // Only process each target once |
| if (seen.geti(target) != 0) continue; |
| ec = U_ZERO_ERROR; |
| seen.puti(target, 1, ec); |
| |
| // Get the script code for the target. If not a script, ignore. |
| UScriptCode targetScript = scriptNameToCode(target); |
| if (targetScript == USCRIPT_INVALID_CODE) continue; |
| |
| int32_t variantCount = Transliterator::_countAvailableVariants(source, target); |
| // assert(variantCount >= 1); |
| for (int32_t v=0; v<variantCount; ++v) { |
| UnicodeString variant; |
| Transliterator::_getAvailableVariant(v, source, target, variant); |
| |
| UnicodeString id; |
| TransliteratorIDParser::STVtoID(UnicodeString(TRUE, ANY, 3), target, variant, id); |
| ec = U_ZERO_ERROR; |
| AnyTransliterator* tl = new AnyTransliterator(id, target, variant, |
| targetScript, ec); |
| if (U_FAILURE(ec)) { |
| delete tl; |
| } else { |
| Transliterator::_registerInstance(tl); |
| Transliterator::_registerSpecialInverse(target, UnicodeString(TRUE, NULL_ID, 4), FALSE); |
| } |
| } |
| } |
| } |
| } |
| |
| U_NAMESPACE_END |
| |
| #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |
| |
| //eof |