blob: 6409a4ea57983220f8df3c8aaf1c49614d8ab1ab [file] [log] [blame]
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
//
//  rbbisetb.h
/*
**********************************************************************
*   Copyright (c) 2001-2005, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*/
11
12#ifndef RBBISETB_H
13#define RBBISETB_H
14
15#include "unicode/utypes.h"
16
17#if !UCONFIG_NO_BREAK_ITERATION
18
Victor Changce4bf3c2021-01-19 16:34:24 +000019#include "unicode/ucptrie.h"
20#include "unicode/umutablecptrie.h"
Victor Chang73229502020-09-17 13:39:19 +010021#include "unicode/uobject.h"
22#include "rbbirb.h"
Victor Chang73229502020-09-17 13:39:19 +010023#include "uvector.h"
24
25U_NAMESPACE_BEGIN
26
//
//  RBBISetBuilder   Derives the character categories used by the runtime RBBI engine
//                   from the Unicode Sets appearing in the source RBBI rules, and
//                   creates the TRIE table used to map from Unicode to the
//                   character categories.
//
33
34
//
//  RangeDescriptor
//
//     Each of the non-overlapping character ranges gets one of these descriptors.
//     All of them are strung together in a linked list, which is kept in order
//     (by character).
//
42class RangeDescriptor : public UMemory {
43public:
Victor Changce4bf3c2021-01-19 16:34:24 +000044 UChar32 fStartChar {}; // Start of range, unicode 32 bit value.
45 UChar32 fEndChar {}; // End of range, unicode 32 bit value.
46 int32_t fNum {0}; // runtime-mapped input value for this range.
47 bool fIncludesDict {false}; // True if the range includes $dictionary.
48 bool fFirstInGroup {false}; // True if first range in a group with the same fNum.
49 UVector *fIncludesSets {nullptr}; // vector of the the original
50 // Unicode sets that include this range.
51 // (Contains ptrs to uset nodes)
52 RangeDescriptor *fNext {nullptr}; // Next RangeDescriptor in the linked list.
Victor Chang73229502020-09-17 13:39:19 +010053
54 RangeDescriptor(UErrorCode &status);
55 RangeDescriptor(const RangeDescriptor &other, UErrorCode &status);
56 ~RangeDescriptor();
57 void split(UChar32 where, UErrorCode &status); // Spit this range in two at "where", with
58 // where appearing in the second (higher) part.
Victor Changce4bf3c2021-01-19 16:34:24 +000059 bool isDictionaryRange(); // Check whether this range appears as part of
Victor Chang73229502020-09-17 13:39:19 +010060 // the Unicode set named "dictionary"
61
Victor Changce4bf3c2021-01-19 16:34:24 +000062 RangeDescriptor(const RangeDescriptor &other) = delete; // forbid default copying of this class
63 RangeDescriptor &operator=(const RangeDescriptor &other) = delete; // forbid assigning of this class
Victor Chang73229502020-09-17 13:39:19 +010064};
65
66
//
//  RBBISetBuilder   Handles processing of Unicode Sets from RBBI rules.
//
//      Starting with the rules parse tree from the scanner,
//
//                   -  Enumerate the set of UnicodeSets that are referenced
//                      by the RBBI rules.
//                   -  Compute a derived set of non-overlapping UnicodeSets
//                      that will correspond to columns in the state table for
//                      the RBBI execution engine.
//                   -  Construct the trie table that maps input characters
//                      to set numbers in the non-overlapping set of sets.
//
80
81
82class RBBISetBuilder : public UMemory {
83public:
84 RBBISetBuilder(RBBIRuleBuilder *rb);
85 ~RBBISetBuilder();
86
87 void buildRanges();
88 void buildTrie();
89 void addValToSets(UVector *sets, uint32_t val);
90 void addValToSet (RBBINode *usetNode, uint32_t val);
91 int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the
92 // runtime state machine, which are the same as
93 // columns in the DFA state table
Victor Changce4bf3c2021-01-19 16:34:24 +000094 int32_t getDictCategoriesStart() const; // First char category that includes $dictionary, or
95 // last category + 1 if there are no dictionary categories.
Victor Chang73229502020-09-17 13:39:19 +010096 int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie.
97 void serializeTrie(uint8_t *where); // write out the serialized Trie.
98 UChar32 getFirstChar(int32_t val) const;
99 UBool sawBOF() const; // Indicate whether any references to the {bof} pseudo
100 // character were encountered.
101 /**
102 * Merge two character categories that have been identified as having equivalent behavior.
103 * The ranges belonging to the second category (table column) will be added to the first.
104 * @param categories the pair of categories to be merged.
105 */
106 void mergeCategories(IntPair categories);
107
Victor Chang73229502020-09-17 13:39:19 +0100108#ifdef RBBI_DEBUG
109 void printSets();
110 void printRanges();
111 void printRangeGroups();
112#else
113 #define printSets()
114 #define printRanges()
115 #define printRangeGroups()
116#endif
117
118private:
Victor Chang73229502020-09-17 13:39:19 +0100119 RBBIRuleBuilder *fRB; // The RBBI Rule Compiler that owns us.
120 UErrorCode *fStatus;
121
122 RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors
123
Victor Changce4bf3c2021-01-19 16:34:24 +0000124 UMutableCPTrie *fMutableTrie; // The mapping TRIE that is the end result of processing
125 UCPTrie *fTrie; // the Unicode Sets.
126 uint32_t fTrieSize;
Victor Chang73229502020-09-17 13:39:19 +0100127
Victor Changce4bf3c2021-01-19 16:34:24 +0000128 // Number of range groups, which are groups of ranges that are in the same original UnicodeSets.
Victor Chang73229502020-09-17 13:39:19 +0100129 int32_t fGroupCount;
130
Victor Changce4bf3c2021-01-19 16:34:24 +0000131 // The number of the first dictionary char category.
132 // If there are no Dictionary categories, set to the last category + 1.
133 int32_t fDictCategoriesStart;
134
Victor Chang73229502020-09-17 13:39:19 +0100135 UBool fSawBOF;
136
137 RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class
138 RBBISetBuilder &operator=(const RBBISetBuilder &other); // forbid copying of this class
139};
140
141
142
143U_NAMESPACE_END
144
145#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
146
147#endif