Blame - icu4c/source/common/rbbi.cpp - platform/external/icu

blob: 2680bf216c789e9f9ee761b7fc939c6b847b6933 [file] [log] [blame]

Fredrik Roubert	64339d3	2016-10-21 19:43:16 +0200	[diff] [blame]	1	// Copyright (C) 2016 and later: Unicode, Inc. and others.
				2	// License & terms of use: http://www.unicode.org/copyright.html
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	3	/*
				4	***************************************************************************
Fredrik Roubert	8de051c	2016-03-10 13:13:27 +0100	[diff] [blame]	5	* Copyright (C) 1999-2016 International Business Machines Corporation
claireho	27f6547	2011-06-09 11:11:49 -0700	[diff] [blame]	6	* and others. All rights reserved.
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	7	***************************************************************************
				8	*/
				9	//
				10	// file: rbbi.c Contains the implementation of the rule based break iterator
				11	// runtime engine and the API implementation for
				12	// class RuleBasedBreakIterator
				13	//
				14
Craig Cornelius	54dcd9b	2013-02-15 14:03:14 -0800	[diff] [blame]	15	#include "utypeinfo.h" // for 'typeid' to work
claireho	27f6547	2011-06-09 11:11:49 -0700	[diff] [blame]	16
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	17	#include "unicode/utypes.h"
				18
				19	#if !UCONFIG_NO_BREAK_ITERATION
				20
				21	#include "unicode/rbbi.h"
				22	#include "unicode/schriter.h"
				23	#include "unicode/uchriter.h"
				24	#include "unicode/udata.h"
				25	#include "unicode/uclean.h"
				26	#include "rbbidata.h"
				27	#include "rbbirb.h"
				28	#include "cmemory.h"
				29	#include "cstring.h"
				30	#include "umutex.h"
				31	#include "ucln_cmn.h"
				32	#include "brkeng.h"
				33
				34	#include "uassert.h"
				35	#include "uvector.h"
				36
				37	// if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.
				38	#if U_LOCAL_SERVICE_HOOK
				39	#include "localsvc.h"
				40	#endif
				41
				42	#ifdef RBBI_DEBUG
				43	static UBool fTrace = FALSE;
				44	#endif
				45
				46	U_NAMESPACE_BEGIN
				47
Jean-Baptiste Queru	c69afce	2009-07-20 15:02:39 -0700	[diff] [blame]	48	// The state number of the starting state
				49	#define START_STATE 1
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	50
Jean-Baptiste Queru	c69afce	2009-07-20 15:02:39 -0700	[diff] [blame]	51	// The state-transition value indicating "stop"
				52	#define STOP_STATE 0
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	53
				54
				55	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
				56
				57
				58	//=======================================================================
				59	// constructors
				60	//=======================================================================
				61
				62	/**
				63	* Constructs a RuleBasedBreakIterator that uses the already-created
				64	* tables object that is passed in as a parameter.
				65	*/
				66	RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
				67	{
				68	init();
				69	fData = new RBBIDataWrapper(data, status); // status checked in constructor
				70	if (U_FAILURE(status)) {return;}
				71	if(fData == 0) {
				72	status = U_MEMORY_ALLOCATION_ERROR;
				73	return;
				74	}
				75	}
				76
claireho	b26ce3a	2012-01-10 17:54:41 -0800	[diff] [blame]	77	//
				78	// Construct from precompiled binary rules (tables). This constructor is public API,
				79	// taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules().
				80	//
				81	RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
				82	uint32_t ruleLength,
				83	UErrorCode &status) {
				84	init();
				85	if (U_FAILURE(status)) {
				86	return;
				87	}
				88	if (compiledRules == NULL \|\| ruleLength < sizeof(RBBIDataHeader)) {
				89	status = U_ILLEGAL_ARGUMENT_ERROR;
				90	return;
				91	}
				92	const RBBIDataHeader data = (const RBBIDataHeader )compiledRules;
				93	if (data->fLength > ruleLength) {
				94	status = U_ILLEGAL_ARGUMENT_ERROR;
				95	return;
				96	}
				97	fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status);
				98	if (U_FAILURE(status)) {return;}
				99	if(fData == 0) {
				100	status = U_MEMORY_ALLOCATION_ERROR;
				101	return;
				102	}
				103	}
				104
				105
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	106	//-------------------------------------------------------------------------------
				107	//
				108	// Constructor from a UDataMemory handle to precompiled break rules
				109	// stored in an ICU data file.
				110	//
				111	//-------------------------------------------------------------------------------
				112	RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
				113	{
				114	init();
				115	fData = new RBBIDataWrapper(udm, status); // status checked in constructor
				116	if (U_FAILURE(status)) {return;}
				117	if(fData == 0) {
				118	status = U_MEMORY_ALLOCATION_ERROR;
				119	return;
				120	}
				121	}
				122
				123
				124
				125	//-------------------------------------------------------------------------------
				126	//
				127	// Constructor from a set of rules supplied as a string.
				128	//
				129	//-------------------------------------------------------------------------------
				130	RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules,
				131	UParseError &parseError,
				132	UErrorCode &status)
				133	{
				134	init();
				135	if (U_FAILURE(status)) {return;}
				136	RuleBasedBreakIterator bi = (RuleBasedBreakIterator )
Jean-Baptiste Queru	c69afce	2009-07-20 15:02:39 -0700	[diff] [blame]	137	RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	138	// Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that
				139	// creates and returns a complete RBBI. From here, in a constructor, we
				140	// can't just return the object created by the builder factory, hence
				141	// the assignment of the factory created object to "this".
				142	if (U_SUCCESS(status)) {
				143	this = bi;
				144	delete bi;
				145	}
				146	}
				147
				148
				149	//-------------------------------------------------------------------------------
				150	//
				151	// Default Constructor. Create an empty shell that can be set up later.
				152	// Used when creating a RuleBasedBreakIterator from a set
				153	// of rules.
				154	//-------------------------------------------------------------------------------
				155	RuleBasedBreakIterator::RuleBasedBreakIterator() {
				156	init();
				157	}
				158
				159
				160	//-------------------------------------------------------------------------------
				161	//
				162	// Copy constructor. Will produce a break iterator with the same behavior,
				163	// and which iterates over the same text, as the one passed in.
				164	//
				165	//-------------------------------------------------------------------------------
				166	RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
				167	: BreakIterator(other)
				168	{
				169	this->init();
				170	*this = other;
				171	}
				172
				173
				174	/**
				175	* Destructor
				176	*/
				177	RuleBasedBreakIterator::~RuleBasedBreakIterator() {
				178	if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
				179	// fCharIter was adopted from the outside.
				180	delete fCharIter;
				181	}
				182	fCharIter = NULL;
				183	delete fSCharIter;
				184	fCharIter = NULL;
				185	delete fDCharIter;
				186	fDCharIter = NULL;
				187
				188	utext_close(fText);
				189
				190	if (fData != NULL) {
				191	fData->removeReference();
				192	fData = NULL;
				193	}
				194	if (fCachedBreakPositions) {
				195	uprv_free(fCachedBreakPositions);
				196	fCachedBreakPositions = NULL;
				197	}
				198	if (fLanguageBreakEngines) {
				199	delete fLanguageBreakEngines;
				200	fLanguageBreakEngines = NULL;
				201	}
				202	if (fUnhandledBreakEngine) {
				203	delete fUnhandledBreakEngine;
				204	fUnhandledBreakEngine = NULL;
				205	}
				206	}
				207
				208	/**
				209	* Assignment operator. Sets this iterator to have the same behavior,
				210	* and iterate over the same text, as the one passed in.
				211	*/
				212	RuleBasedBreakIterator&
				213	RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
				214	if (this == &that) {
				215	return *this;
				216	}
				217	reset(); // Delete break cache information
				218	fBreakType = that.fBreakType;
				219	if (fLanguageBreakEngines != NULL) {
				220	delete fLanguageBreakEngines;
				221	fLanguageBreakEngines = NULL; // Just rebuild for now
				222	}
				223	// TODO: clone fLanguageBreakEngines from "that"
				224	UErrorCode status = U_ZERO_ERROR;
				225	fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);
				226
				227	if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
				228	delete fCharIter;
				229	}
				230	fCharIter = NULL;
				231
				232	if (that.fCharIter != NULL ) {
				233	// This is a little bit tricky - it will intially appear that
				234	// this->fCharIter is adopted, even if that->fCharIter was
				235	// not adopted. That's ok.
				236	fCharIter = that.fCharIter->clone();
				237	}
				238
				239	if (fData != NULL) {
				240	fData->removeReference();
				241	fData = NULL;
				242	}
				243	if (that.fData != NULL) {
				244	fData = that.fData->addReference();
				245	}
				246
				247	return *this;
				248	}
				249
				250
				251
				252	//-----------------------------------------------------------------------------
				253	//
				254	// init() Shared initialization routine. Used by all the constructors.
				255	// Initializes all fields, leaving the object in a consistent state.
				256	//
				257	//-----------------------------------------------------------------------------
				258	void RuleBasedBreakIterator::init() {
				259	UErrorCode status = U_ZERO_ERROR;
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	260	fText = utext_openUChars(NULL, NULL, 0, &status);
				261	fCharIter = NULL;
				262	fSCharIter = NULL;
				263	fDCharIter = NULL;
				264	fData = NULL;
				265	fLastRuleStatusIndex = 0;
				266	fLastStatusIndexValid = TRUE;
				267	fDictionaryCharCount = 0;
claireho	50294ea	2010-05-03 15:44:48 -0700	[diff] [blame]	268	fBreakType = UBRK_WORD; // Defaulting BreakType to word gives reasonable
				269	// dictionary behavior for Break Iterators that are
				270	// built from rules. Even better would be the ability to
				271	// declare the type in the rules.
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	272
				273	fCachedBreakPositions = NULL;
				274	fLanguageBreakEngines = NULL;
				275	fUnhandledBreakEngine = NULL;
				276	fNumCachedBreakPositions = 0;
				277	fPositionInCache = 0;
				278
				279	#ifdef RBBI_DEBUG
				280	static UBool debugInitDone = FALSE;
				281	if (debugInitDone == FALSE) {
				282	char *debugEnv = getenv("U_RBBIDEBUG");
				283	if (debugEnv && uprv_strstr(debugEnv, "trace")) {
				284	fTrace = TRUE;
				285	}
				286	debugInitDone = TRUE;
				287	}
				288	#endif
				289	}
				290
				291
				292
				293	//-----------------------------------------------------------------------------
				294	//
				295	// clone - Returns a newly-constructed RuleBasedBreakIterator with the same
				296	// behavior, and iterating over the same text, as this one.
				297	// Virtual function: does the right thing with subclasses.
				298	//
				299	//-----------------------------------------------------------------------------
				300	BreakIterator*
				301	RuleBasedBreakIterator::clone(void) const {
				302	return new RuleBasedBreakIterator(*this);
				303	}
				304
				305	/**
				306	* Equality operator. Returns TRUE if both BreakIterators are of the
				307	* same class, have the same behavior, and iterate over the same text.
				308	*/
				309	UBool
				310	RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
claireho	27f6547	2011-06-09 11:11:49 -0700	[diff] [blame]	311	if (typeid(*this) != typeid(that)) {
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	312	return FALSE;
				313	}
				314
				315	const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
				316
				317	if (!utext_equals(fText, that2.fText)) {
				318	// The two break iterators are operating on different text,
				319	// or have a different interation position.
				320	return FALSE;
				321	};
				322
				323	// TODO: need a check for when in a dictionary region at different offsets.
				324
				325	if (that2.fData == fData \|\|
				326	(fData != NULL && that2.fData != NULL && that2.fData == fData)) {
				327	// The two break iterators are using the same rules.
				328	return TRUE;
				329	}
				330	return FALSE;
				331	}
				332
				333	/**
				334	* Compute a hash code for this BreakIterator
				335	* @return A hash code
				336	*/
				337	int32_t
				338	RuleBasedBreakIterator::hashCode(void) const {
				339	int32_t hash = 0;
				340	if (fData != NULL) {
				341	hash = fData->hashCode();
				342	}
				343	return hash;
				344	}
				345
				346
				347	void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
				348	if (U_FAILURE(status)) {
				349	return;
				350	}
				351	reset();
				352	fText = utext_clone(fText, ut, FALSE, TRUE, &status);
				353
				354	// Set up a dummy CharacterIterator to be returned if anyone
				355	// calls getText(). With input from UText, there is no reasonable
				356	// way to return a characterIterator over the actual input text.
				357	// Return one over an empty string instead - this is the closest
				358	// we can come to signaling a failure.
				359	// (GetText() is obsolete, this failure is sort of OK)
				360	if (fDCharIter == NULL) {
Jean-Baptiste Queru	c69afce	2009-07-20 15:02:39 -0700	[diff] [blame]	361	static const UChar c = 0;
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	362	fDCharIter = new UCharCharacterIterator(&c, 0);
Jean-Baptiste Queru	c69afce	2009-07-20 15:02:39 -0700	[diff] [blame]	363	if (fDCharIter == NULL) {
				364	status = U_MEMORY_ALLOCATION_ERROR;
				365	return;
				366	}
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	367	}
				368
				369	if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
				370	// existing fCharIter was adopted from the outside. Delete it now.
				371	delete fCharIter;
				372	}
				373	fCharIter = fDCharIter;
				374
				375	this->first();
				376	}
				377
				378
				379	UText RuleBasedBreakIterator::getUText(UText fillIn, UErrorCode &status) const {
				380	UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status);
				381	return result;
				382	}
				383
				384
				385
				386	/**
				387	* Returns the description used to create this iterator
				388	*/
				389	const UnicodeString&
				390	RuleBasedBreakIterator::getRules() const {
				391	if (fData != NULL) {
				392	return fData->getRuleSourceString();
				393	} else {
				394	static const UnicodeString *s;
				395	if (s == NULL) {
				396	// TODO: something more elegant here.
				397	// perhaps API should return the string by value.
				398	// Note: thread unsafe init & leak are semi-ok, better than
				399	// what was before. Sould be cleaned up, though.
				400	s = new UnicodeString;
				401	}
				402	return *s;
				403	}
				404	}
				405
				406	//=======================================================================
				407	// BreakIterator overrides
				408	//=======================================================================
				409
				410	/**
				411	* Return a CharacterIterator over the text being analyzed.
				412	*/
				413	CharacterIterator&
				414	RuleBasedBreakIterator::getText() const {
				415	return *fCharIter;
				416	}
				417
				418	/**
				419	* Set the iterator to analyze a new piece of text. This function resets
				420	* the current iteration position to the beginning of the text.
				421	* @param newText An iterator over the text to analyze.
				422	*/
				423	void
				424	RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
				425	// If we are holding a CharacterIterator adopted from a
				426	// previous call to this function, delete it now.
				427	if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
				428	delete fCharIter;
				429	}
				430
				431	fCharIter = newText;
				432	UErrorCode status = U_ZERO_ERROR;
				433	reset();
				434	if (newText==NULL \|\| newText->startIndex() != 0) {
				435	// startIndex !=0 wants to be an error, but there's no way to report it.
				436	// Make the iterator text be an empty string.
				437	fText = utext_openUChars(fText, NULL, 0, &status);
				438	} else {
				439	fText = utext_openCharacterIterator(fText, newText, &status);
				440	}
				441	this->first();
				442	}
				443
				444	/**
				445	* Set the iterator to analyze a new piece of text. This function resets
				446	* the current iteration position to the beginning of the text.
				447	* @param newText An iterator over the text to analyze.
				448	*/
				449	void
				450	RuleBasedBreakIterator::setText(const UnicodeString& newText) {
				451	UErrorCode status = U_ZERO_ERROR;
				452	reset();
				453	fText = utext_openConstUnicodeString(fText, &newText, &status);
				454
				455	// Set up a character iterator on the string.
				456	// Needed in case someone calls getText().
				457	// Can not, unfortunately, do this lazily on the (probably never)
				458	// call to getText(), because getText is const.
				459	if (fSCharIter == NULL) {
				460	fSCharIter = new StringCharacterIterator(newText);
				461	} else {
				462	fSCharIter->setText(newText);
				463	}
				464
				465	if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
				466	// old fCharIter was adopted from the outside. Delete it.
				467	delete fCharIter;
				468	}
				469	fCharIter = fSCharIter;
				470
				471	this->first();
				472	}
				473
				474
Elliott Hughes	4fceb0a	2012-10-08 13:16:08 -0700	[diff] [blame]	475	/**
				476	* Provide a new UText for the input text. Must reference text with contents identical
				477	* to the original.
				478	* Intended for use with text data originating in Java (garbage collected) environments
				479	* where the data may be moved in memory at arbitrary times.
				480	*/
				481	RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) {
				482	if (U_FAILURE(status)) {
				483	return *this;
				484	}
				485	if (input == NULL) {
				486	status = U_ILLEGAL_ARGUMENT_ERROR;
				487	return *this;
				488	}
				489	int64_t pos = utext_getNativeIndex(fText);
				490	// Shallow read-only clone of the new UText into the existing input UText
				491	fText = utext_clone(fText, input, FALSE, TRUE, &status);
				492	if (U_FAILURE(status)) {
				493	return *this;
				494	}
				495	utext_setNativeIndex(fText, pos);
				496	if (utext_getNativeIndex(fText) != pos) {
				497	// Sanity check. The new input utext is supposed to have the exact same
				498	// contents as the old. If we can't set to the same position, it doesn't.
				499	// The contents underlying the old utext might be invalid at this point,
				500	// so it's not safe to check directly.
				501	status = U_ILLEGAL_ARGUMENT_ERROR;
				502	}
				503	return *this;
				504	}
				505
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	506
				507	/**
ccornelius	f9878a2	2014-11-20 18:09:39 -0800	[diff] [blame]	508	* Sets the current iteration position to the beginning of the text, position zero.
				509	* @return The new iterator position, which is zero.
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	510	*/
				511	int32_t RuleBasedBreakIterator::first(void) {
				512	reset();
				513	fLastRuleStatusIndex = 0;
				514	fLastStatusIndexValid = TRUE;
				515	//if (fText == NULL)
				516	// return BreakIterator::DONE;
				517
				518	utext_setNativeIndex(fText, 0);
				519	return 0;
				520	}
				521
				522	/**
				523	* Sets the current iteration position to the end of the text.
				524	* @return The text's past-the-end offset.
				525	*/
				526	int32_t RuleBasedBreakIterator::last(void) {
				527	reset();
				528	if (fText == NULL) {
				529	fLastRuleStatusIndex = 0;
				530	fLastStatusIndexValid = TRUE;
				531	return BreakIterator::DONE;
				532	}
				533
				534	fLastStatusIndexValid = FALSE;
				535	int32_t pos = (int32_t)utext_nativeLength(fText);
				536	utext_setNativeIndex(fText, pos);
				537	return pos;
				538	}
				539
				540	/**
				541	* Advances the iterator either forward or backward the specified number of steps.
				542	* Negative values move backward, and positive values move forward. This is
				543	* equivalent to repeatedly calling next() or previous().
				544	* @param n The number of steps to move. The sign indicates the direction
				545	* (negative is backwards, and positive is forwards).
				546	* @return The character offset of the boundary position n boundaries away from
				547	* the current one.
				548	*/
				549	int32_t RuleBasedBreakIterator::next(int32_t n) {
				550	int32_t result = current();
				551	while (n > 0) {
				552	result = next();
				553	--n;
				554	}
				555	while (n < 0) {
				556	result = previous();
				557	++n;
				558	}
				559	return result;
				560	}
				561
				562	/**
				563	* Advances the iterator to the next boundary position.
				564	* @return The position of the first boundary after this one.
				565	*/
				566	int32_t RuleBasedBreakIterator::next(void) {
				567	// if we have cached break positions and we're still in the range
				568	// covered by them, just move one step forward in the cache
				569	if (fCachedBreakPositions != NULL) {
				570	if (fPositionInCache < fNumCachedBreakPositions - 1) {
				571	++fPositionInCache;
				572	int32_t pos = fCachedBreakPositions[fPositionInCache];
				573	utext_setNativeIndex(fText, pos);
				574	return pos;
				575	}
				576	else {
				577	reset();
				578	}
				579	}
				580
				581	int32_t startPos = current();
ccornelius	fceb398	2014-04-16 12:27:14 -0700	[diff] [blame]	582	fDictionaryCharCount = 0;
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	583	int32_t result = handleNext(fData->fForwardTable);
				584	if (fDictionaryCharCount > 0) {
				585	result = checkDictionary(startPos, result, FALSE);
				586	}
				587	return result;
				588	}
				589
				590	/**
				591	* Advances the iterator backwards, to the last boundary preceding this one.
				592	* @return The position of the last boundary position preceding this one.
				593	*/
				594	int32_t RuleBasedBreakIterator::previous(void) {
				595	int32_t result;
				596	int32_t startPos;
				597
				598	// if we have cached break positions and we're still in the range
				599	// covered by them, just move one step backward in the cache
				600	if (fCachedBreakPositions != NULL) {
				601	if (fPositionInCache > 0) {
				602	--fPositionInCache;
				603	// If we're at the beginning of the cache, need to reevaluate the
				604	// rule status
				605	if (fPositionInCache <= 0) {
				606	fLastStatusIndexValid = FALSE;
				607	}
				608	int32_t pos = fCachedBreakPositions[fPositionInCache];
				609	utext_setNativeIndex(fText, pos);
				610	return pos;
				611	}
				612	else {
				613	reset();
				614	}
				615	}
				616
				617	// if we're already sitting at the beginning of the text, return DONE
				618	if (fText == NULL \|\| (startPos = current()) == 0) {
				619	fLastRuleStatusIndex = 0;
				620	fLastStatusIndexValid = TRUE;
				621	return BreakIterator::DONE;
				622	}
				623
				624	if (fData->fSafeRevTable != NULL \|\| fData->fSafeFwdTable != NULL) {
				625	result = handlePrevious(fData->fReverseTable);
				626	if (fDictionaryCharCount > 0) {
				627	result = checkDictionary(result, startPos, TRUE);
				628	}
				629	return result;
				630	}
				631
				632	// old rule syntax
				633	// set things up. handlePrevious() will back us up to some valid
				634	// break position before the current position (we back our internal
				635	// iterator up one step to prevent handlePrevious() from returning
				636	// the current position), but not necessarily the last one before
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	637	// where we started
				638
				639	int32_t start = current();
				640
claireho	b26ce3a	2012-01-10 17:54:41 -0800	[diff] [blame]	641	(void)UTEXT_PREVIOUS32(fText);
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	642	int32_t lastResult = handlePrevious(fData->fReverseTable);
				643	if (lastResult == UBRK_DONE) {
				644	lastResult = 0;
				645	utext_setNativeIndex(fText, 0);
				646	}
				647	result = lastResult;
				648	int32_t lastTag = 0;
				649	UBool breakTagValid = FALSE;
				650
				651	// iterate forward from the known break position until we pass our
				652	// starting point. The last break position before the starting
				653	// point is our return value
				654
				655	for (;;) {
				656	result = next();
				657	if (result == BreakIterator::DONE \|\| result >= start) {
				658	break;
				659	}
				660	lastResult = result;
				661	lastTag = fLastRuleStatusIndex;
				662	breakTagValid = TRUE;
				663	}
				664
				665	// fLastBreakTag wants to have the value for section of text preceding
				666	// the result position that we are to return (in lastResult.) If
				667	// the backwards rules overshot and the above loop had to do two or more
				668	// next()s to move up to the desired return position, we will have a valid
ccornelius	fceb398	2014-04-16 12:27:14 -0700	[diff] [blame]	669	// tag value. But, if handlePrevious() took us to exactly the correct result position,
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	670	// we wont have a tag value for that position, which is only set by handleNext().
				671
ccornelius	fceb398	2014-04-16 12:27:14 -0700	[diff] [blame]	672	// Set the current iteration position to be the last break position
				673	// before where we started, and then return that value.
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	674	utext_setNativeIndex(fText, lastResult);
				675	fLastRuleStatusIndex = lastTag; // for use by getRuleStatus()
				676	fLastStatusIndexValid = breakTagValid;
				677
				678	// No need to check the dictionary; it will have been handled by
				679	// next()
				680
				681	return lastResult;
				682	}
				683
				684	/**
				685	* Sets the iterator to refer to the first boundary position following
				686	* the specified position.
				687	* @offset The position from which to begin searching for a break position.
				688	* @return The position of the first break after the current position.
				689	*/
				690	int32_t RuleBasedBreakIterator::following(int32_t offset) {
ccornelius	f9878a2	2014-11-20 18:09:39 -0800	[diff] [blame]	691	// if the offset passed in is already past the end of the text,
				692	// just return DONE; if it's before the beginning, return the
				693	// text's starting offset
				694	if (fText == NULL \|\| offset >= utext_nativeLength(fText)) {
				695	last();
				696	return next();
				697	}
				698	else if (offset < 0) {
				699	return first();
				700	}
				701
				702	// Move requested offset to a code point start. It might be on a trail surrogate,
				703	// or on a trail byte if the input is UTF-8.
				704	utext_setNativeIndex(fText, offset);
Fredrik Roubert	8de051c	2016-03-10 13:13:27 +0100	[diff] [blame]	705	offset = (int32_t)utext_getNativeIndex(fText);
ccornelius	f9878a2	2014-11-20 18:09:39 -0800	[diff] [blame]	706
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	707	// if we have cached break positions and offset is in the range
				708	// covered by them, use them
				709	// TODO: could use binary search
				710	// TODO: what if offset is outside range, but break is not?
				711	if (fCachedBreakPositions != NULL) {
				712	if (offset >= fCachedBreakPositions[0]
				713	&& offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
				714	fPositionInCache = 0;
				715	// We are guaranteed not to leave the array due to range test above
				716	while (offset >= fCachedBreakPositions[fPositionInCache]) {
				717	++fPositionInCache;
				718	}
				719	int32_t pos = fCachedBreakPositions[fPositionInCache];
				720	utext_setNativeIndex(fText, pos);
				721	return pos;
				722	}
				723	else {
				724	reset();
				725	}
				726	}
				727
ccornelius	f9878a2	2014-11-20 18:09:39 -0800	[diff] [blame]	728	// Set our internal iteration position (temporarily)
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	729	// to the position passed in. If this is the _beginning_ position,
				730	// then we can just use next() to get our return value
				731
				732	int32_t result = 0;
				733
				734	if (fData->fSafeRevTable != NULL) {
				735	// new rule syntax
				736	utext_setNativeIndex(fText, offset);
				737	// move forward one codepoint to prepare for moving back to a
				738	// safe point.
				739	// this handles offset being between a supplementary character
ccornelius	f9878a2	2014-11-20 18:09:39 -0800	[diff] [blame]	740	// TODO: is this still needed, with move to code point boundary handled above?
claireho	b26ce3a	2012-01-10 17:54:41 -0800	[diff] [blame]	741	(void)UTEXT_NEXT32(fText);
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	742	// handlePrevious will move most of the time to < 1 boundary away
				743	handlePrevious(fData->fSafeRevTable);
				744	int32_t result = next();
				745	while (result <= offset) {
				746	result = next();
				747	}
				748	return result;
				749	}
				750	if (fData->fSafeFwdTable != NULL) {
				751	// backup plan if forward safe table is not available
				752	utext_setNativeIndex(fText, offset);
claireho	b26ce3a	2012-01-10 17:54:41 -0800	[diff] [blame]	753	(void)UTEXT_PREVIOUS32(fText);
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	754	// handle next will give result >= offset
				755	handleNext(fData->fSafeFwdTable);
				756	// previous will give result 0 or 1 boundary away from offset,
				757	// most of the time
				758	// we have to
				759	int32_t oldresult = previous();
				760	while (oldresult > offset) {
				761	int32_t result = previous();
				762	if (result <= offset) {
				763	return oldresult;
				764	}
				765	oldresult = result;
				766	}
				767	int32_t result = next();
				768	if (result <= offset) {
				769	return next();
				770	}
				771	return result;
				772	}
				773	// otherwise, we have to sync up first. Use handlePrevious() to back
				774	// up to a known break position before the specified position (if
				775	// we can determine that the specified position is a break position,
				776	// we don't back up at all). This may or may not be the last break
				777	// position at or before our starting position. Advance forward
				778	// from here until we've passed the starting position. The position
				779	// we stop on will be the first break position after the specified one.
				780	// old rule syntax
				781
				782	utext_setNativeIndex(fText, offset);
				783	if (offset==0 \|\|
claireho	27f6547	2011-06-09 11:11:49 -0700	[diff] [blame]	784	(offset==1 && utext_getNativeIndex(fText)==0)) {
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	785	return next();
				786	}
				787	result = previous();
				788
				789	while (result != BreakIterator::DONE && result <= offset) {
				790	result = next();
				791	}
				792
				793	return result;
				794	}
				795
				796	/**
				797	* Sets the iterator to refer to the last boundary position before the
				798	* specified position.
				799	* @offset The position to begin searching for a break from.
				800	* @return The position of the last boundary before the starting position.
				801	*/
				802	int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
ccornelius	f9878a2	2014-11-20 18:09:39 -0800	[diff] [blame]	803	// if the offset passed in is already past the end of the text,
				804	// just return DONE; if it's before the beginning, return the
				805	// text's starting offset
				806	if (fText == NULL \|\| offset > utext_nativeLength(fText)) {
				807	return last();
				808	}
				809	else if (offset < 0) {
				810	return first();
				811	}
				812
				813	// Move requested offset to a code point start. It might be on a trail surrogate,
				814	// or on a trail byte if the input is UTF-8.
				815	utext_setNativeIndex(fText, offset);
Fredrik Roubert	8de051c	2016-03-10 13:13:27 +0100	[diff] [blame]	816	offset = (int32_t)utext_getNativeIndex(fText);
ccornelius	f9878a2	2014-11-20 18:09:39 -0800	[diff] [blame]	817
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	818	// if we have cached break positions and offset is in the range
				819	// covered by them, use them
				820	if (fCachedBreakPositions != NULL) {
				821	// TODO: binary search?
				822	// TODO: What if offset is outside range, but break is not?
				823	if (offset > fCachedBreakPositions[0]
				824	&& offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
				825	fPositionInCache = 0;
				826	while (fPositionInCache < fNumCachedBreakPositions
				827	&& offset > fCachedBreakPositions[fPositionInCache])
				828	++fPositionInCache;
				829	--fPositionInCache;
				830	// If we're at the beginning of the cache, need to reevaluate the
				831	// rule status
				832	if (fPositionInCache <= 0) {
				833	fLastStatusIndexValid = FALSE;
				834	}
				835	utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]);
				836	return fCachedBreakPositions[fPositionInCache];
				837	}
				838	else {
				839	reset();
				840	}
				841	}
				842
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	843	// if we start by updating the current iteration position to the
				844	// position specified by the caller, we can just use previous()
				845	// to carry out this operation
				846
				847	if (fData->fSafeFwdTable != NULL) {
				848	// new rule syntax
				849	utext_setNativeIndex(fText, offset);
				850	int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
				851	if (newOffset != offset) {
				852	// Will come here if specified offset was not a code point boundary AND
				853	// the underlying implmentation is using UText, which snaps any non-code-point-boundary
				854	// indices to the containing code point.
				855	// For breakitereator::preceding only, these non-code-point indices need to be moved
				856	// up to refer to the following codepoint.
claireho	b26ce3a	2012-01-10 17:54:41 -0800	[diff] [blame]	857	(void)UTEXT_NEXT32(fText);
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	858	offset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
				859	}
				860
				861	// TODO: (synwee) would it be better to just check for being in the middle of a surrogate pair,
				862	// rather than adjusting the position unconditionally?
				863	// (Change would interact with safe rules.)
				864	// TODO: change RBBI behavior for off-boundary indices to match that of UText?
				865	// affects only preceding(), seems cleaner, but is slightly different.
claireho	b26ce3a	2012-01-10 17:54:41 -0800	[diff] [blame]	866	(void)UTEXT_PREVIOUS32(fText);
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	867	handleNext(fData->fSafeFwdTable);
				868	int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
				869	while (result >= offset) {
				870	result = previous();
				871	}
				872	return result;
				873	}
				874	if (fData->fSafeRevTable != NULL) {
				875	// backup plan if forward safe table is not available
				876	// TODO: check whether this path can be discarded
				877	// It's probably OK to say that rules must supply both safe tables
				878	// if they use safe tables at all. We have certainly never described
				879	// to anyone how to work with just one safe table.
				880	utext_setNativeIndex(fText, offset);
claireho	b26ce3a	2012-01-10 17:54:41 -0800	[diff] [blame]	881	(void)UTEXT_NEXT32(fText);
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	882
				883	// handle previous will give result <= offset
				884	handlePrevious(fData->fSafeRevTable);
				885
				886	// next will give result 0 or 1 boundary away from offset,
				887	// most of the time
				888	// we have to
				889	int32_t oldresult = next();
				890	while (oldresult < offset) {
				891	int32_t result = next();
				892	if (result >= offset) {
				893	return oldresult;
				894	}
				895	oldresult = result;
				896	}
				897	int32_t result = previous();
				898	if (result >= offset) {
				899	return previous();
				900	}
				901	return result;
				902	}
				903
				904	// old rule syntax
				905	utext_setNativeIndex(fText, offset);
				906	return previous();
				907	}
				908
				909	/**
				910	* Returns true if the specfied position is a boundary position. As a side
				911	* effect, leaves the iterator pointing to the first boundary position at
				912	* or after "offset".
				913	* @param offset the offset to check.
				914	* @return True if "offset" is a boundary position.
				915	*/
				916	UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
				917	// the beginning index of the iterator is always a boundary position by definition
				918	if (offset == 0) {
				919	first(); // For side effects on current position, tag values.
				920	return TRUE;
				921	}
				922
				923	if (offset == (int32_t)utext_nativeLength(fText)) {
				924	last(); // For side effects on current position, tag values.
				925	return TRUE;
				926	}
				927
				928	// out-of-range indexes are never boundary positions
				929	if (offset < 0) {
				930	first(); // For side effects on current position, tag values.
				931	return FALSE;
				932	}
				933
				934	if (offset > utext_nativeLength(fText)) {
				935	last(); // For side effects on current position, tag values.
				936	return FALSE;
				937	}
				938
				939	// otherwise, we can use following() on the position before the specified
				940	// one and return true if the position we get back is the one the user
				941	// specified
				942	utext_previous32From(fText, offset);
				943	int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText);
				944	UBool result = following(backOne) == offset;
				945	return result;
				946	}
				947
				948	/**
				949	* Returns the current iteration position.
				950	* @return The current iteration position.
				951	*/
				952	int32_t RuleBasedBreakIterator::current(void) const {
				953	int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
				954	return pos;
				955	}
				956
				957	//=======================================================================
				958	// implementation
				959	//=======================================================================
				960
				961	//
				962	// RBBIRunMode - the state machine runs an extra iteration at the beginning and end
				963	// of user text. A variable with this enum type keeps track of where we
				964	// are. The state machine only fetches user input while in the RUN mode.
				965	//
				966	enum RBBIRunMode {
				967	RBBI_START, // state machine processing is before first char of input
				968	RBBI_RUN, // state machine processing is in the user text
				969	RBBI_END // state machine processing is after end of user text.
				970	};
				971
				972
Fredrik Roubert	8de051c	2016-03-10 13:13:27 +0100	[diff] [blame]	973	// Map from look-ahead break states (corresponds to rules) to boundary positions.
				974	// Allows multiple lookahead break rules to be in flight at the same time.
				975	//
				976	// This is a temporary approach for ICU 57. A better fix is to make the look-ahead numbers
				977	// in the state table be sequential, then we can just index an array. And the
				978	// table could also tell us in advance how big that array needs to be.
				979	//
				980	// Before ICU 57 there was just a single simple variable for a look-ahead match that
				981	// was in progress. Two rules at once did not work.
				982
				983	static const int32_t kMaxLookaheads = 8;
				984	struct LookAheadResults {
				985	int32_t fUsedSlotLimit;
				986	int32_t fPositions[8];
				987	int16_t fKeys[8];
				988
				989	LookAheadResults() : fUsedSlotLimit(0), fPositions(), fKeys() {};
				990
				991	int32_t getPosition(int16_t key) {
				992	for (int32_t i=0; i<fUsedSlotLimit; ++i) {
				993	if (fKeys[i] == key) {
				994	return fPositions[i];
				995	}
				996	}
				997	U_ASSERT(FALSE);
				998	return -1;
				999	}
				1000
				1001	void setPosition(int16_t key, int32_t position) {
				1002	int32_t i;
				1003	for (i=0; i<fUsedSlotLimit; ++i) {
				1004	if (fKeys[i] == key) {
				1005	fPositions[i] = position;
				1006	return;
				1007	}
				1008	}
				1009	if (i >= kMaxLookaheads) {
				1010	U_ASSERT(FALSE);
				1011	i = kMaxLookaheads - 1;
				1012	}
				1013	fKeys[i] = key;
				1014	fPositions[i] = position;
				1015	U_ASSERT(fUsedSlotLimit == i);
				1016	fUsedSlotLimit = i + 1;
				1017	}
				1018	};
				1019
				1020
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1021	//-----------------------------------------------------------------------------------
				1022	//
				1023	// handleNext(stateTable)
				1024	// This method is the actual implementation of the rbbi next() method.
				1025	// This method initializes the state machine to state 1
				1026	// and advances through the text character by character until we reach the end
				1027	// of the text or the state machine transitions to state 0. We update our return
				1028	// value every time the state machine passes through an accepting state.
				1029	//
				1030	//-----------------------------------------------------------------------------------
				1031	int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
				1032	int32_t state;
Craig Cornelius	103e9ff	2012-10-09 17:03:29 -0700	[diff] [blame]	1033	uint16_t category = 0;
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1034	RBBIRunMode mode;
				1035
				1036	RBBIStateTableRow *row;
				1037	UChar32 c;
Fredrik Roubert	8de051c	2016-03-10 13:13:27 +0100	[diff] [blame]	1038	LookAheadResults lookAheadMatches;
				1039	int32_t result = 0;
				1040	int32_t initialPosition = 0;
				1041	const char *tableData = statetable->fTableData;
				1042	uint32_t tableRowLen = statetable->fRowLen;
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1043
				1044	#ifdef RBBI_DEBUG
				1045	if (fTrace) {
				1046	RBBIDebugPuts("Handle Next pos char state category");
				1047	}
				1048	#endif
				1049
				1050	// No matter what, handleNext alway correctly sets the break tag value.
				1051	fLastStatusIndexValid = TRUE;
				1052	fLastRuleStatusIndex = 0;
				1053
				1054	// if we're already at the end of the text, return DONE.
				1055	initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
				1056	result = initialPosition;
				1057	c = UTEXT_NEXT32(fText);
				1058	if (fData == NULL \|\| c==U_SENTINEL) {
				1059	return BreakIterator::DONE;
				1060	}
				1061
				1062	// Set the initial state for the state machine
				1063	state = START_STATE;
				1064	row = (RBBIStateTableRow *)
				1065	//(statetable->fTableData + (statetable->fRowLen * state));
				1066	(tableData + tableRowLen * state);
				1067
				1068
				1069	mode = RBBI_RUN;
				1070	if (statetable->fFlags & RBBI_BOF_REQUIRED) {
				1071	category = 2;
				1072	mode = RBBI_START;
				1073	}
				1074
				1075
				1076	// loop until we reach the end of the text or transition to state 0
				1077	//
				1078	for (;;) {
				1079	if (c == U_SENTINEL) {
				1080	// Reached end of input string.
				1081	if (mode == RBBI_END) {
				1082	// We have already run the loop one last time with the
				1083	// character set to the psueudo {eof} value. Now it is time
				1084	// to unconditionally bail out.
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1085	break;
				1086	}
				1087	// Run the loop one last time with the fake end-of-input character category.
				1088	mode = RBBI_END;
				1089	category = 1;
				1090	}
				1091
				1092	//
				1093	// Get the char category. An incoming category of 1 or 2 means that
				1094	// we are preset for doing the beginning or end of input, and
				1095	// that we shouldn't get a category from an actual text input character.
				1096	//
				1097	if (mode == RBBI_RUN) {
				1098	// look up the current character's character category, which tells us
				1099	// which column in the state table to look at.
				1100	// Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
				1101	// not the size of the character going in, which is a UChar32.
				1102	//
				1103	UTRIE_GET16(&fData->fTrie, c, category);
				1104
				1105	// Check the dictionary bit in the character's category.
				1106	// Counter is only used by dictionary based iterators (subclasses).
				1107	// Chars that need to be handled by a dictionary have a flag bit set
				1108	// in their category values.
				1109	//
				1110	if ((category & 0x4000) != 0) {
				1111	fDictionaryCharCount++;
				1112	// And off the dictionary flag bit.
				1113	category &= ~0x4000;
				1114	}
				1115	}
				1116
Craig Cornelius	103e9ff	2012-10-09 17:03:29 -0700	[diff] [blame]	1117	#ifdef RBBI_DEBUG
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1118	if (fTrace) {
claireho	50294ea	2010-05-03 15:44:48 -0700	[diff] [blame]	1119	RBBIDebugPrintf(" %4ld ", utext_getNativeIndex(fText));
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1120	if (0x20<=c && c<0x7f) {
				1121	RBBIDebugPrintf("\"%c\" ", c);
				1122	} else {
				1123	RBBIDebugPrintf("%5x ", c);
				1124	}
				1125	RBBIDebugPrintf("%3d %3d\n", state, category);
				1126	}
				1127	#endif
				1128
				1129	// State Transition - move machine to its next state
				1130	//
Craig Cornelius	103e9ff	2012-10-09 17:03:29 -0700	[diff] [blame]	1131
				1132	// Note: fNextState is defined as uint16_t[2], but we are casting
				1133	// a generated RBBI table to RBBIStateTableRow and some tables
				1134	// actually have more than 2 categories.
				1135	U_ASSERT(category<fData->fHeader->fCatCount);
				1136	state = row->fNextState[category]; /Not accessing beyond memory/
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1137	row = (RBBIStateTableRow *)
				1138	// (statetable->fTableData + (statetable->fRowLen * state));
				1139	(tableData + tableRowLen * state);
				1140
				1141
				1142	if (row->fAccepting == -1) {
				1143	// Match found, common case.
				1144	if (mode != RBBI_START) {
				1145	result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
				1146	}
				1147	fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values.
				1148	}
				1149
Fredrik Roubert	8de051c	2016-03-10 13:13:27 +0100	[diff] [blame]	1150	int16_t completedRule = row->fAccepting;
				1151	if (completedRule > 0) {
				1152	// Lookahead match is completed.
				1153	int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);
				1154	if (lookaheadResult >= 0) {
				1155	fLastRuleStatusIndex = row->fTagIdx;
				1156	UTEXT_SETNATIVEINDEX(fText, lookaheadResult);
				1157	return lookaheadResult;
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1158	}
Fredrik Roubert	8de051c	2016-03-10 13:13:27 +0100	[diff] [blame]	1159	}
				1160	int16_t rule = row->fLookAhead;
				1161	if (rule != 0) {
				1162	// At the position of a '/' in a look-ahead match. Record it.
				1163	int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
				1164	lookAheadMatches.setPosition(rule, pos);
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1165	}
				1166
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1167	if (state == STOP_STATE) {
				1168	// This is the normal exit from the lookup state machine.
				1169	// We have advanced through the string until it is certain that no
				1170	// longer match is possible, no matter what characters follow.
				1171	break;
				1172	}
				1173
				1174	// Advance to the next character.
				1175	// If this is a beginning-of-input loop iteration, don't advance
				1176	// the input position. The next iteration will be processing the
				1177	// first real input character.
				1178	if (mode == RBBI_RUN) {
				1179	c = UTEXT_NEXT32(fText);
				1180	} else {
				1181	if (mode == RBBI_START) {
				1182	mode = RBBI_RUN;
				1183	}
				1184	}
				1185
				1186
				1187	}
				1188
				1189	// The state machine is done. Check whether it found a match...
				1190
				1191	// If the iterator failed to advance in the match engine, force it ahead by one.
				1192	// (This really indicates a defect in the break rules. They should always match
				1193	// at least one character.)
				1194	if (result == initialPosition) {
				1195	UTEXT_SETNATIVEINDEX(fText, initialPosition);
				1196	UTEXT_NEXT32(fText);
				1197	result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
				1198	}
				1199
				1200	// Leave the iterator at our result position.
				1201	UTEXT_SETNATIVEINDEX(fText, result);
				1202	#ifdef RBBI_DEBUG
				1203	if (fTrace) {
				1204	RBBIDebugPrintf("result = %d\n\n", result);
				1205	}
				1206	#endif
				1207	return result;
				1208	}
				1209
				1210
				1211
				1212	//-----------------------------------------------------------------------------------
				1213	//
				1214	// handlePrevious()
				1215	//
				1216	// Iterate backwards, according to the logic of the reverse rules.
				1217	// This version handles the exact style backwards rules.
				1218	//
				1219	// The logic of this function is very similar to handleNext(), above.
				1220	//
				1221	//-----------------------------------------------------------------------------------
				1222	int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {
				1223	int32_t state;
Craig Cornelius	103e9ff	2012-10-09 17:03:29 -0700	[diff] [blame]	1224	uint16_t category = 0;
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1225	RBBIRunMode mode;
				1226	RBBIStateTableRow *row;
				1227	UChar32 c;
Fredrik Roubert	8de051c	2016-03-10 13:13:27 +0100	[diff] [blame]	1228	LookAheadResults lookAheadMatches;
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1229	int32_t result = 0;
				1230	int32_t initialPosition = 0;
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1231
				1232	#ifdef RBBI_DEBUG
				1233	if (fTrace) {
				1234	RBBIDebugPuts("Handle Previous pos char state category");
				1235	}
				1236	#endif
				1237
				1238	// handlePrevious() never gets the rule status.
				1239	// Flag the status as invalid; if the user ever asks for status, we will need
				1240	// to back up, then re-find the break position using handleNext(), which does
				1241	// get the status value.
				1242	fLastStatusIndexValid = FALSE;
				1243	fLastRuleStatusIndex = 0;
				1244
				1245	// if we're already at the start of the text, return DONE.
				1246	if (fText == NULL \|\| fData == NULL \|\| UTEXT_GETNATIVEINDEX(fText)==0) {
				1247	return BreakIterator::DONE;
				1248	}
				1249
				1250	// Set up the starting char.
				1251	initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
				1252	result = initialPosition;
				1253	c = UTEXT_PREVIOUS32(fText);
				1254
				1255	// Set the initial state for the state machine
				1256	state = START_STATE;
				1257	row = (RBBIStateTableRow *)
				1258	(statetable->fTableData + (statetable->fRowLen * state));
				1259	category = 3;
				1260	mode = RBBI_RUN;
				1261	if (statetable->fFlags & RBBI_BOF_REQUIRED) {
				1262	category = 2;
				1263	mode = RBBI_START;
				1264	}
				1265
				1266
				1267	// loop until we reach the start of the text or transition to state 0
				1268	//
				1269	for (;;) {
				1270	if (c == U_SENTINEL) {
				1271	// Reached end of input string.
claireho	27f6547	2011-06-09 11:11:49 -0700	[diff] [blame]	1272	if (mode == RBBI_END) {
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1273	// We have already run the loop one last time with the
				1274	// character set to the psueudo {eof} value. Now it is time
				1275	// to unconditionally bail out.
Fredrik Roubert	8de051c	2016-03-10 13:13:27 +0100	[diff] [blame]	1276	if (result == initialPosition) {
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1277	// Ran off start, no match found.
				1278	// move one index one (towards the start, since we are doing a previous())
				1279	UTEXT_SETNATIVEINDEX(fText, initialPosition);
claireho	b26ce3a	2012-01-10 17:54:41 -0800	[diff] [blame]	1280	(void)UTEXT_PREVIOUS32(fText); // TODO: shouldn't be necessary. We're already at beginning. Check.
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1281	}
				1282	break;
				1283	}
				1284	// Run the loop one last time with the fake end-of-input character category.
				1285	mode = RBBI_END;
				1286	category = 1;
				1287	}
				1288
				1289	//
				1290	// Get the char category. An incoming category of 1 or 2 means that
				1291	// we are preset for doing the beginning or end of input, and
				1292	// that we shouldn't get a category from an actual text input character.
				1293	//
				1294	if (mode == RBBI_RUN) {
				1295	// look up the current character's character category, which tells us
				1296	// which column in the state table to look at.
				1297	// Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
				1298	// not the size of the character going in, which is a UChar32.
				1299	//
				1300	UTRIE_GET16(&fData->fTrie, c, category);
				1301
				1302	// Check the dictionary bit in the character's category.
				1303	// Counter is only used by dictionary based iterators (subclasses).
				1304	// Chars that need to be handled by a dictionary have a flag bit set
				1305	// in their category values.
				1306	//
				1307	if ((category & 0x4000) != 0) {
				1308	fDictionaryCharCount++;
				1309	// And off the dictionary flag bit.
				1310	category &= ~0x4000;
				1311	}
				1312	}
				1313
				1314	#ifdef RBBI_DEBUG
				1315	if (fTrace) {
				1316	RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(fText));
				1317	if (0x20<=c && c<0x7f) {
				1318	RBBIDebugPrintf("\"%c\" ", c);
				1319	} else {
				1320	RBBIDebugPrintf("%5x ", c);
				1321	}
				1322	RBBIDebugPrintf("%3d %3d\n", state, category);
				1323	}
				1324	#endif
				1325
				1326	// State Transition - move machine to its next state
				1327	//
Craig Cornelius	103e9ff	2012-10-09 17:03:29 -0700	[diff] [blame]	1328
				1329	// Note: fNextState is defined as uint16_t[2], but we are casting
				1330	// a generated RBBI table to RBBIStateTableRow and some tables
				1331	// actually have more than 2 categories.
				1332	U_ASSERT(category<fData->fHeader->fCatCount);
				1333	state = row->fNextState[category]; /Not accessing beyond memory/
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1334	row = (RBBIStateTableRow *)
				1335	(statetable->fTableData + (statetable->fRowLen * state));
				1336
				1337	if (row->fAccepting == -1) {
				1338	// Match found, common case.
				1339	result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
				1340	}
				1341
Fredrik Roubert	8de051c	2016-03-10 13:13:27 +0100	[diff] [blame]	1342	int16_t completedRule = row->fAccepting;
				1343	if (completedRule > 0) {
				1344	// Lookahead match is completed.
				1345	int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);
				1346	if (lookaheadResult >= 0) {
				1347	UTEXT_SETNATIVEINDEX(fText, lookaheadResult);
				1348	return lookaheadResult;
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1349	}
Fredrik Roubert	8de051c	2016-03-10 13:13:27 +0100	[diff] [blame]	1350	}
				1351	int16_t rule = row->fLookAhead;
				1352	if (rule != 0) {
				1353	// At the position of a '/' in a look-ahead match. Record it.
				1354	int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
				1355	lookAheadMatches.setPosition(rule, pos);
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1356	}
				1357
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1358	if (state == STOP_STATE) {
				1359	// This is the normal exit from the lookup state machine.
				1360	// We have advanced through the string until it is certain that no
				1361	// longer match is possible, no matter what characters follow.
				1362	break;
				1363	}
				1364
				1365	// Move (backwards) to the next character to process.
				1366	// If this is a beginning-of-input loop iteration, don't advance
				1367	// the input position. The next iteration will be processing the
				1368	// first real input character.
				1369	if (mode == RBBI_RUN) {
				1370	c = UTEXT_PREVIOUS32(fText);
				1371	} else {
				1372	if (mode == RBBI_START) {
				1373	mode = RBBI_RUN;
				1374	}
				1375	}
				1376	}
				1377
				1378	// The state machine is done. Check whether it found a match...
				1379
				1380	// If the iterator failed to advance in the match engine, force it ahead by one.
				1381	// (This really indicates a defect in the break rules. They should always match
				1382	// at least one character.)
				1383	if (result == initialPosition) {
				1384	UTEXT_SETNATIVEINDEX(fText, initialPosition);
				1385	UTEXT_PREVIOUS32(fText);
				1386	result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
				1387	}
				1388
				1389	// Leave the iterator at our result position.
				1390	UTEXT_SETNATIVEINDEX(fText, result);
				1391	#ifdef RBBI_DEBUG
				1392	if (fTrace) {
				1393	RBBIDebugPrintf("result = %d\n\n", result);
				1394	}
				1395	#endif
				1396	return result;
				1397	}
				1398
				1399
				1400	void
				1401	RuleBasedBreakIterator::reset()
				1402	{
				1403	if (fCachedBreakPositions) {
				1404	uprv_free(fCachedBreakPositions);
				1405	}
				1406	fCachedBreakPositions = NULL;
				1407	fNumCachedBreakPositions = 0;
				1408	fDictionaryCharCount = 0;
				1409	fPositionInCache = 0;
				1410	}
				1411
				1412
				1413
				1414	//-------------------------------------------------------------------------------
				1415	//
				1416	// getRuleStatus() Return the break rule tag associated with the current
				1417	// iterator position. If the iterator arrived at its current
				1418	// position by iterating forwards, the value will have been
				1419	// cached by the handleNext() function.
				1420	//
				1421	// If no cached status value is available, the status is
				1422	// found by doing a previous() followed by a next(), which
				1423	// leaves the iterator where it started, and computes the
				1424	// status while doing the next().
				1425	//
				1426	//-------------------------------------------------------------------------------
				1427	void RuleBasedBreakIterator::makeRuleStatusValid() {
				1428	if (fLastStatusIndexValid == FALSE) {
				1429	// No cached status is available.
				1430	if (fText == NULL \|\| current() == 0) {
				1431	// At start of text, or there is no text. Status is always zero.
				1432	fLastRuleStatusIndex = 0;
				1433	fLastStatusIndexValid = TRUE;
				1434	} else {
				1435	// Not at start of text. Find status the tedious way.
				1436	int32_t pa = current();
				1437	previous();
				1438	if (fNumCachedBreakPositions > 0) {
				1439	reset(); // Blow off the dictionary cache
				1440	}
				1441	int32_t pb = next();
				1442	if (pa != pb) {
				1443	// note: the if (pa != pb) test is here only to eliminate warnings for
				1444	// unused local variables on gcc. Logically, it isn't needed.
				1445	U_ASSERT(pa == pb);
				1446	}
				1447	}
				1448	}
				1449	U_ASSERT(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fData->fStatusMaxIdx);
				1450	}
				1451
				1452
				1453	int32_t RuleBasedBreakIterator::getRuleStatus() const {
				1454	RuleBasedBreakIterator nonConstThis = (RuleBasedBreakIterator )this;
				1455	nonConstThis->makeRuleStatusValid();
				1456
				1457	// fLastRuleStatusIndex indexes to the start of the appropriate status record
				1458	// (the number of status values.)
				1459	// This function returns the last (largest) of the array of status values.
				1460	int32_t idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex];
				1461	int32_t tagVal = fData->fRuleStatusTable[idx];
				1462
				1463	return tagVal;
				1464	}
				1465
				1466
				1467
				1468
				1469	int32_t RuleBasedBreakIterator::getRuleStatusVec(
				1470	int32_t *fillInVec, int32_t capacity, UErrorCode &status)
				1471	{
				1472	if (U_FAILURE(status)) {
				1473	return 0;
				1474	}
				1475
				1476	RuleBasedBreakIterator nonConstThis = (RuleBasedBreakIterator )this;
				1477	nonConstThis->makeRuleStatusValid();
				1478	int32_t numVals = fData->fRuleStatusTable[fLastRuleStatusIndex];
				1479	int32_t numValsToCopy = numVals;
				1480	if (numVals > capacity) {
				1481	status = U_BUFFER_OVERFLOW_ERROR;
				1482	numValsToCopy = capacity;
				1483	}
				1484	int i;
				1485	for (i=0; i<numValsToCopy; i++) {
				1486	fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1];
				1487	}
				1488	return numVals;
				1489	}
				1490
				1491
				1492
				1493	//-------------------------------------------------------------------------------
				1494	//
				1495	// getBinaryRules Access to the compiled form of the rules,
				1496	// for use by build system tools that save the data
				1497	// for standard iterator types.
				1498	//
				1499	//-------------------------------------------------------------------------------
				1500	const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
				1501	const uint8_t *retPtr = NULL;
				1502	length = 0;
				1503
				1504	if (fData != NULL) {
				1505	retPtr = (const uint8_t *)fData->fHeader;
				1506	length = fData->fHeader->fLength;
				1507	}
				1508	return retPtr;
				1509	}
				1510
				1511
ccornelius	59d709d	2014-02-20 10:29:46 -0800	[diff] [blame]	1512	BreakIterator * RuleBasedBreakIterator::createBufferClone(void * /stackBuffer/,
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1513	int32_t &bufferSize,
				1514	UErrorCode &status)
				1515	{
				1516	if (U_FAILURE(status)){
				1517	return NULL;
				1518	}
				1519
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1520	if (bufferSize == 0) {
ccornelius	59d709d	2014-02-20 10:29:46 -0800	[diff] [blame]	1521	bufferSize = 1; // preflighting for deprecated functionality
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1522	return NULL;
				1523	}
				1524
ccornelius	59d709d	2014-02-20 10:29:46 -0800	[diff] [blame]	1525	BreakIterator *clonedBI = clone();
				1526	if (clonedBI == NULL) {
				1527	status = U_MEMORY_ALLOCATION_ERROR;
				1528	} else {
				1529	status = U_SAFECLONE_ALLOCATED_WARNING;
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1530	}
ccornelius	59d709d	2014-02-20 10:29:46 -0800	[diff] [blame]	1531	return (RuleBasedBreakIterator *)clonedBI;
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1532	}
				1533
				1534
				1535	//-------------------------------------------------------------------------------
				1536	//
				1537	// isDictionaryChar Return true if the category lookup for this char
				1538	// indicates that it is in the set of dictionary lookup
				1539	// chars.
				1540	//
				1541	// This function is intended for use by dictionary based
				1542	// break iterators.
				1543	//
				1544	//-------------------------------------------------------------------------------
				1545	/*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32 c) {
				1546	if (fData == NULL) {
				1547	return FALSE;
				1548	}
				1549	uint16_t category;
				1550	UTRIE_GET16(&fData->fTrie, c, category);
				1551	return (category & 0x4000) != 0;
				1552	}*/
				1553
				1554
				1555	//-------------------------------------------------------------------------------
				1556	//
				1557	// checkDictionary This function handles all processing of characters in
				1558	// the "dictionary" set. It will determine the appropriate
				1559	// course of action, and possibly set up a cache in the
				1560	// process.
				1561	//
				1562	//-------------------------------------------------------------------------------
				1563	int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
				1564	int32_t endPos,
				1565	UBool reverse) {
				1566	// Reset the old break cache first.
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1567	reset();
				1568
Craig Cornelius	54dcd9b	2013-02-15 14:03:14 -0800	[diff] [blame]	1569	// note: code segment below assumes that dictionary chars are in the
				1570	// startPos-endPos range
				1571	// value returned should be next character in sequence
				1572	if ((endPos - startPos) <= 1) {
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1573	return (reverse ? startPos : endPos);
				1574	}
				1575
				1576	// Starting from the starting point, scan towards the proposed result,
				1577	// looking for the first dictionary character (which may be the one
				1578	// we're on, if we're starting in the middle of a range).
				1579	utext_setNativeIndex(fText, reverse ? endPos : startPos);
				1580	if (reverse) {
				1581	UTEXT_PREVIOUS32(fText);
				1582	}
				1583
				1584	int32_t rangeStart = startPos;
				1585	int32_t rangeEnd = endPos;
				1586
				1587	uint16_t category;
				1588	int32_t current;
				1589	UErrorCode status = U_ZERO_ERROR;
				1590	UStack breaks(status);
				1591	int32_t foundBreakCount = 0;
				1592	UChar32 c = utext_current32(fText);
				1593
				1594	UTRIE_GET16(&fData->fTrie, c, category);
				1595
				1596	// Is the character we're starting on a dictionary character? If so, we
				1597	// need to back up to include the entire run; otherwise the results of
				1598	// the break algorithm will differ depending on where we start. Since
				1599	// the result is cached and there is typically a non-dictionary break
				1600	// within a small number of words, there should be little performance impact.
				1601	if (category & 0x4000) {
				1602	if (reverse) {
				1603	do {
				1604	utext_next32(fText); // TODO: recast to work directly with postincrement.
				1605	c = utext_current32(fText);
				1606	UTRIE_GET16(&fData->fTrie, c, category);
				1607	} while (c != U_SENTINEL && (category & 0x4000));
				1608	// Back up to the last dictionary character
				1609	rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
				1610	if (c == U_SENTINEL) {
				1611	// c = fText->last32();
				1612	// TODO: why was this if needed?
				1613	c = UTEXT_PREVIOUS32(fText);
				1614	}
				1615	else {
				1616	c = UTEXT_PREVIOUS32(fText);
				1617	}
				1618	}
				1619	else {
				1620	do {
				1621	c = UTEXT_PREVIOUS32(fText);
				1622	UTRIE_GET16(&fData->fTrie, c, category);
				1623	}
				1624	while (c != U_SENTINEL && (category & 0x4000));
				1625	// Back up to the last dictionary character
				1626	if (c == U_SENTINEL) {
				1627	// c = fText->first32();
				1628	c = utext_current32(fText);
				1629	}
				1630	else {
				1631	utext_next32(fText);
				1632	c = utext_current32(fText);
				1633	}
				1634	rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
				1635	}
				1636	UTRIE_GET16(&fData->fTrie, c, category);
				1637	}
				1638
				1639	// Loop through the text, looking for ranges of dictionary characters.
				1640	// For each span, find the appropriate break engine, and ask it to find
				1641	// any breaks within the span.
				1642	// Note: we always do this in the forward direction, so that the break
				1643	// cache is built in the right order.
				1644	if (reverse) {
				1645	utext_setNativeIndex(fText, rangeStart);
				1646	c = utext_current32(fText);
				1647	UTRIE_GET16(&fData->fTrie, c, category);
				1648	}
				1649	while(U_SUCCESS(status)) {
				1650	while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
				1651	utext_next32(fText); // TODO: tweak for post-increment operation
				1652	c = utext_current32(fText);
				1653	UTRIE_GET16(&fData->fTrie, c, category);
				1654	}
				1655	if (current >= rangeEnd) {
				1656	break;
				1657	}
				1658
				1659	// We now have a dictionary character. Get the appropriate language object
				1660	// to deal with it.
				1661	const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);
				1662
				1663	// Ask the language object if there are any breaks. It will leave the text
				1664	// pointer on the other side of its range, ready to search for the next one.
				1665	if (lbe != NULL) {
				1666	foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks);
				1667	}
				1668
				1669	// Reload the loop variables for the next go-round
				1670	c = utext_current32(fText);
				1671	UTRIE_GET16(&fData->fTrie, c, category);
				1672	}
				1673
				1674	// If we found breaks, build a new break cache. The first and last entries must
				1675	// be the original starting and ending position.
				1676	if (foundBreakCount > 0) {
ccornelius	fceb398	2014-04-16 12:27:14 -0700	[diff] [blame]	1677	U_ASSERT(foundBreakCount == breaks.size());
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1678	int32_t totalBreaks = foundBreakCount;
				1679	if (startPos < breaks.elementAti(0)) {
				1680	totalBreaks += 1;
				1681	}
				1682	if (endPos > breaks.peeki()) {
				1683	totalBreaks += 1;
				1684	}
				1685	fCachedBreakPositions = (int32_t )uprv_malloc(totalBreaks sizeof(int32_t));
				1686	if (fCachedBreakPositions != NULL) {
				1687	int32_t out = 0;
				1688	fNumCachedBreakPositions = totalBreaks;
				1689	if (startPos < breaks.elementAti(0)) {
				1690	fCachedBreakPositions[out++] = startPos;
				1691	}
				1692	for (int32_t i = 0; i < foundBreakCount; ++i) {
				1693	fCachedBreakPositions[out++] = breaks.elementAti(i);
				1694	}
				1695	if (endPos > fCachedBreakPositions[out-1]) {
				1696	fCachedBreakPositions[out] = endPos;
				1697	}
				1698	// If there are breaks, then by definition, we are replacing the original
				1699	// proposed break by one of the breaks we found. Use following() and
				1700	// preceding() to do the work. They should never recurse in this case.
				1701	if (reverse) {
Craig Cornelius	54dcd9b	2013-02-15 14:03:14 -0800	[diff] [blame]	1702	return preceding(endPos);
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1703	}
				1704	else {
				1705	return following(startPos);
				1706	}
				1707	}
				1708	// If the allocation failed, just fall through to the "no breaks found" case.
				1709	}
				1710
				1711	// If we get here, there were no language-based breaks. Set the text pointer
				1712	// to the original proposed break.
				1713	utext_setNativeIndex(fText, reverse ? startPos : endPos);
				1714	return (reverse ? startPos : endPos);
				1715	}
				1716
ccornelius	59d709d	2014-02-20 10:29:46 -0800	[diff] [blame]	1717	U_NAMESPACE_END
				1718
				1719
Craig Cornelius	103e9ff	2012-10-09 17:03:29 -0700	[diff] [blame]	1720	static icu::UStack *gLanguageBreakFactories = NULL;
ccornelius	59d709d	2014-02-20 10:29:46 -0800	[diff] [blame]	1721	static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER;
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1722
				1723	/**
				1724	* Release all static memory held by breakiterator.
				1725	*/
				1726	U_CDECL_BEGIN
				1727	static UBool U_CALLCONV breakiterator_cleanup_dict(void) {
				1728	if (gLanguageBreakFactories) {
				1729	delete gLanguageBreakFactories;
				1730	gLanguageBreakFactories = NULL;
				1731	}
ccornelius	59d709d	2014-02-20 10:29:46 -0800	[diff] [blame]	1732	gLanguageBreakFactoriesInitOnce.reset();
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1733	return TRUE;
				1734	}
				1735	U_CDECL_END
				1736
				1737	U_CDECL_BEGIN
				1738	static void U_CALLCONV _deleteFactory(void *obj) {
Craig Cornelius	103e9ff	2012-10-09 17:03:29 -0700	[diff] [blame]	1739	delete (icu::LanguageBreakFactory *) obj;
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1740	}
				1741	U_CDECL_END
				1742	U_NAMESPACE_BEGIN
				1743
ccornelius	59d709d	2014-02-20 10:29:46 -0800	[diff] [blame]	1744	static void U_CALLCONV initLanguageFactories() {
				1745	UErrorCode status = U_ZERO_ERROR;
				1746	U_ASSERT(gLanguageBreakFactories == NULL);
				1747	gLanguageBreakFactories = new UStack(_deleteFactory, NULL, status);
				1748	if (gLanguageBreakFactories != NULL && U_SUCCESS(status)) {
				1749	ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
				1750	gLanguageBreakFactories->push(builtIn, status);
				1751	#ifdef U_LOCAL_SERVICE_HOOK
				1752	LanguageBreakFactory extra = (LanguageBreakFactory )uprv_svc_hook("languageBreakFactory", &status);
				1753	if (extra != NULL) {
				1754	gLanguageBreakFactories->push(extra, status);
				1755	}
				1756	#endif
				1757	}
				1758	ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict);
				1759	}
				1760
				1761
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1762	static const LanguageBreakEngine*
				1763	getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
				1764	{
ccornelius	59d709d	2014-02-20 10:29:46 -0800	[diff] [blame]	1765	umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories);
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1766	if (gLanguageBreakFactories == NULL) {
				1767	return NULL;
				1768	}
				1769
				1770	int32_t i = gLanguageBreakFactories->size();
				1771	const LanguageBreakEngine *lbe = NULL;
				1772	while (--i >= 0) {
				1773	LanguageBreakFactory factory = (LanguageBreakFactory )(gLanguageBreakFactories->elementAt(i));
				1774	lbe = factory->getEngineFor(c, breakType);
				1775	if (lbe != NULL) {
				1776	break;
				1777	}
				1778	}
				1779	return lbe;
				1780	}
				1781
				1782
				1783	//-------------------------------------------------------------------------------
				1784	//
				1785	// getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the
Craig Cornelius	54dcd9b	2013-02-15 14:03:14 -0800	[diff] [blame]	1786	// the character c.
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1787	//
				1788	//-------------------------------------------------------------------------------
				1789	const LanguageBreakEngine *
				1790	RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
				1791	const LanguageBreakEngine *lbe = NULL;
				1792	UErrorCode status = U_ZERO_ERROR;
				1793
				1794	if (fLanguageBreakEngines == NULL) {
				1795	fLanguageBreakEngines = new UStack(status);
Jean-Baptiste Queru	c69afce	2009-07-20 15:02:39 -0700	[diff] [blame]	1796	if (fLanguageBreakEngines == NULL \|\| U_FAILURE(status)) {
Jean-Baptiste Queru	b13da9d	2009-07-17 17:53:22 -0700	[diff] [blame]	1797	delete fLanguageBreakEngines;
				1798	fLanguageBreakEngines = 0;
				1799	return NULL;
				1800	}
				1801	}
				1802
				1803	int32_t i = fLanguageBreakEngines->size();
				1804	while (--i >= 0) {
				1805	lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
				1806	if (lbe->handles(c, fBreakType)) {
				1807	return lbe;
				1808	}
				1809	}
				1810
				1811	// No existing dictionary took the character. See if a factory wants to
				1812	// give us a new LanguageBreakEngine for this character.
				1813	lbe = getLanguageBreakEngineFromFactory(c, fBreakType);
				1814
				1815	// If we got one, use it and push it on our stack.
				1816	if (lbe != NULL) {
				1817	fLanguageBreakEngines->push((void *)lbe, status);
				1818	// Even if we can't remember it, we can keep looking it up, so
				1819	// return it even if the push fails.
				1820	return lbe;
				1821	}
				1822
				1823	// No engine is forthcoming for this character. Add it to the
				1824	// reject set. Create the reject break engine if needed.
				1825	if (fUnhandledBreakEngine == NULL) {
				1826	fUnhandledBreakEngine = new UnhandledEngine(status);
				1827	if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {
				1828	status = U_MEMORY_ALLOCATION_ERROR;
				1829	}
				1830	// Put it last so that scripts for which we have an engine get tried
				1831	// first.
				1832	fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
				1833	// If we can't insert it, or creation failed, get rid of it
				1834	if (U_FAILURE(status)) {
				1835	delete fUnhandledBreakEngine;
				1836	fUnhandledBreakEngine = 0;
				1837	return NULL;
				1838	}
				1839	}
				1840
				1841	// Tell the reject engine about the character; at its discretion, it may
				1842	// add more than just the one character.
				1843	fUnhandledBreakEngine->handleCharacter(c, fBreakType);
				1844
				1845	return fUnhandledBreakEngine;
				1846	}
				1847
				1848
				1849
				1850	/*int32_t RuleBasedBreakIterator::getBreakType() const {
				1851	return fBreakType;
				1852	}*/
				1853
				1854	void RuleBasedBreakIterator::setBreakType(int32_t type) {
				1855	fBreakType = type;
				1856	reset();
				1857	}
				1858
				1859	U_NAMESPACE_END
				1860
				1861	#endif /* #if !UCONFIG_NO_BREAK_ITERATION */