Export libtextclassifier
Test: atest TextClassifierServiceTest
BUG: 151413366
Change-Id: Ic93a18c7c30a7978313245c7b28845a47ed03570
diff --git a/native/annotator/annotator.cc b/native/annotator/annotator.cc
index ce46786..eca8ab8 100644
--- a/native/annotator/annotator.cc
+++ b/native/annotator/annotator.cc
@@ -137,8 +137,6 @@
fb_annotation_options->enable_date_range();
result_annotation_options.include_preposition =
fb_annotation_options->include_preposition();
- result_annotation_options.expand_date_series =
- fb_annotation_options->expand_date_series();
if (fb_annotation_options->extra_requested_dates() != nullptr) {
for (const auto& extra_requested_date :
*fb_annotation_options->extra_requested_dates()) {
@@ -1065,6 +1063,12 @@
return false;
}
+ // A PERSONNAME entity does not conflict with anything.
+ if ((source_mask &
+ (1 << static_cast<int>(AnnotatedSpan::Source::PERSON_NAME)))) {
+ return false;
+ }
+
// Entities from other sources can conflict.
return true;
}
@@ -1761,6 +1765,7 @@
person_name_engine_->ClassifyText(context, selection_indices,
&person_name_result)) {
candidates.push_back({selection_indices, {person_name_result}});
+ candidates.back().source = AnnotatedSpan::Source::PERSON_NAME;
}
// Try the installed app engine.
diff --git a/native/annotator/grammar/dates/annotations/annotation-options.h b/native/annotator/grammar/dates/annotations/annotation-options.h
index d5445fe..29e9939 100755
--- a/native/annotator/grammar/dates/annotations/annotation-options.h
+++ b/native/annotator/grammar/dates/annotations/annotation-options.h
@@ -59,16 +59,6 @@
// instance: "Monday" and "6pm".
bool enable_date_range;
- // If enabled, expand a date series. Must have date_range enabled to be used.
- // The date range cannot exceed 30 days.
- // input: April 4-6, 6:30pm
- // If the flag is true, the extracted annotation will contaly 3 instance
- // which are April 4 at 6:30pm, April 5 at 6:30pm and April 6 at 6:30pm
- // all have the same begin and end annotation
- // If the flag is false, the extracted annotation contains one time range
- // instance and one date instance
- bool expand_date_series;
-
// Timezone in which the input text was written
std::string reference_timezone;
// Localization params.
@@ -98,7 +88,6 @@
include_preposition(false),
base_timestamp_millis(0),
enable_date_range(false),
- expand_date_series(false),
use_rule_priority_score(false),
generate_alternative_interpretations_when_ambiguous(false) {}
};
diff --git a/native/annotator/grammar/dates/parser.cc b/native/annotator/grammar/dates/parser.cc
index 7587b0b..566827e 100644
--- a/native/annotator/grammar/dates/parser.cc
+++ b/native/annotator/grammar/dates/parser.cc
@@ -234,7 +234,7 @@
// Copies the field from one DateMatch to another whose field is null. for
// example: if the from is "May 1, 8pm", and the to is "9pm", "May 1" will be
-// copied to "to". Now we only copy fields for date range requirement.
+// copied to "to". Now we only copy fields for date range requirement.
void CopyFieldsForDateMatch(const DateMatch& from, DateMatch* to) {
if (from.time_span_match != nullptr && to->time_span_match == nullptr) {
to->time_span_match = from.time_span_match;
@@ -743,101 +743,6 @@
return number_of_days > kMaximumExpansion;
}
-// Expands a date range and merges it with a time.
-// e.g. April 4-6, 2:00pm will be expanded into April 4 at 2pm, April 5 at 2pm
-// and April 6 at 2:00pm
-// - Only supports a range of days with a time
-// - Does not expand a date range without time
-void ExpandDateRangeAndMergeWithTime(
- const UniLib& unilib, const std::vector<UnicodeText::const_iterator>& text,
- const std::vector<std::string>& ignored_spans,
- std::vector<DateMatch>* times, std::vector<DateRangeMatch>* date_ranges) {
- auto next_time = times->begin();
- auto next_range = date_ranges->begin();
- while (next_range != date_ranges->end() && next_time != times->end()) {
- const DateRangeMatch& range = *next_range;
- if (range.from.HasHour() || !IsPrecedent(range.from, range.to)) {
- ++next_range;
- continue;
- }
-
- while (next_time != times->end()) {
- const DateMatch& time = *next_time;
- if (!time.IsStandaloneTime()) {
- ++next_time;
- continue;
- }
-
- // The range is before the time
- if (range.end <= time.begin) {
- if (AreDateMatchesAdjacentAndMergeable(unilib, text, ignored_spans,
- range.to, time) &&
- !IsDateRangeTooLong(range)) {
- std::vector<DateMatch> expanded_dates;
- ExpandDateRange(range, &expanded_dates);
-
- // Merge the expaneded date and with time
- std::vector<DateMatch> merged_times;
- for (const auto& expanded_date : expanded_dates) {
- DateMatch merged_time = time;
- MergeDateMatch(expanded_date, &merged_time, true);
- merged_times.push_back(merged_time);
- }
- // Insert the expanded time before next_time and move next_time point
- // to previous time.
- next_time = times->insert(next_time, merged_times.begin(),
- merged_times.end());
- next_time += merged_times.size();
-
- // Remove merged time. now next_time point to the time after the
- // merged time.
- next_time = times->erase(next_time);
- // Remove merged range, now next_range point to the range after the
- // merged range.
- next_range = date_ranges->erase(next_range);
- } else {
- // range is behind time, check next range.
- ++next_range;
- }
- break;
- } else if (range.end > time.end && range.begin > time.begin) {
- // The range is after the time
- if (AreDateMatchesAdjacentAndMergeable(unilib, text, ignored_spans,
- time, range.from) &&
- !IsDateRangeTooLong(range)) {
- std::vector<DateMatch> expanded_dates;
- ExpandDateRange(range, &expanded_dates);
-
- // Merge the expaneded dates with time
- for (auto& expanded_date : expanded_dates) {
- MergeDateMatch(time, &expanded_date, true);
- }
- // Insert expanded time before next_time and move next_time point to
- // previous time.
- next_time = times->insert(next_time, expanded_dates.begin(),
- expanded_dates.end());
- next_time += expanded_dates.size();
-
- // Remove merged time. Now next_time point to the time after the
- // merged time.
- next_time = times->erase(next_time);
- // Remove merged range. Now next_range point to the range after the
- // merged range.
- next_range = date_ranges->erase(next_range);
- break;
- } else {
- // Since the range is after the time, we need to check the next time
- // first
- ++next_time;
- }
- } else {
- // Range fully overlaps with time In this case, we move to the next time
- ++next_time;
- }
- }
- }
-}
-
// Fills `DateTimes` proto from matched `DateMatch` and `DateRangeMatch`
// instances.
std::vector<Annotation> GetOutputAsAnnotationList(
@@ -877,25 +782,12 @@
MergeDateRangeAndDate(unilib, text, options.ignored_spans, date_matches,
&date_range_matches);
RemoveOverlappedDateByRange(date_range_matches, &date_matches);
-
- if (options.expand_date_series) {
- ExpandDateRangeAndMergeWithTime(unilib, text, options.ignored_spans,
- &date_matches, &date_range_matches);
- }
}
FillDateRangeInstances(date_range_matches, &date_annotations);
}
if (!date_matches.empty()) {
FillDateInstances(unilib, text, options, &date_matches, &date_annotations);
-
- int64 timestamp_ms = options.base_timestamp_millis;
- if (timestamp_ms > 0) {
- // The timestamp in options is milliseconds, the time_t is seconds from
- // 00:00 Jan 1 1970 UTC.
- time_t base_timestamp = timestamp_ms / 1000;
- NormalizeDateTimes(base_timestamp, &date_annotations);
- }
}
return date_annotations;
}
diff --git a/native/annotator/grammar/dates/utils/date-utils.cc b/native/annotator/grammar/dates/utils/date-utils.cc
index 02f4873..5a68838 100644
--- a/native/annotator/grammar/dates/utils/date-utils.cc
+++ b/native/annotator/grammar/dates/utils/date-utils.cc
@@ -360,404 +360,6 @@
}
namespace {
-int NormalizeField(int base, int zero, int* valp, int carry_in) {
- int carry_out = 0;
- int val = *valp;
- if (zero != 0 && val < 0) {
- val += base;
- carry_out -= 1;
- }
- val -= zero;
- carry_out += val / base;
- int rem = val % base;
- if (carry_in != 0) {
- carry_out += carry_in / base;
- rem += carry_in % base;
- if (rem < 0) {
- carry_out -= 1;
- rem += base;
- } else if (rem >= base) {
- carry_out += 1;
- rem -= base;
- }
- }
- if (rem < 0) {
- carry_out -= 1;
- rem += base;
- }
- *valp = rem + zero;
- return carry_out;
-}
-
-int DaysPerYear(int year) {
- if (IsLeapYear(year)) {
- return DAYSPERLYEAR;
- }
- return DAYSPERNYEAR;
-}
-
-const int8 kDaysPer100Years[401] = {
- 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-};
-
-int DaysPer100Years(int eyear) { return 36524 + kDaysPer100Years[eyear]; }
-
-const int8 kDaysPer4Years[401] = {
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-};
-
-int DaysPer4Years(int eyear) { return 1460 + kDaysPer4Years[eyear]; }
-
-#define DAYORDYEARMAX (25252734927766553LL)
-#define DAYORDYEARMIN (-25252734927764584LL)
-
-// Normalize year, month, day, hour, minute and second to valid value. For
-// example: 1hour 15minute 62second is normalized as 1hour 16 minute 2second.
-bool NormalizeDateFields(int* year, int* month, int* day, int* hour,
- int* minute, int* second) {
- int min_carry = NormalizeField(SECSPERMIN, 0, second, 0);
- int hour_carry = NormalizeField(MINSPERHOUR, 0, minute, min_carry);
- int day_carry = NormalizeField(HOURSPERDAY, 0, hour, hour_carry);
- int year_carry = NormalizeField(MONSPERYEAR, 1, month, 0);
- bool normalized = min_carry || hour_carry || day_carry || year_carry;
-
- // Normalize the number of days within a 400-year (146097-day) period.
- if (int c4_carry = NormalizeField(146097, 1, day, day_carry)) {
- year_carry += c4_carry * 400;
- normalized = true;
- }
-
- // Extract a [0:399] year calendrically equivalent to (year + year_carry)
- // from that sum in order to simplify year/day normalization and to defer
- // the possibility of int64 overflow until the final stage.
- int eyear = *year % 400;
- if (year_carry != 0) {
- eyear += year_carry;
- eyear %= 400;
- }
- if (eyear < 0) eyear += 400;
- year_carry -= eyear;
-
- int orig_day = *day;
- if (*day > DAYSPERNYEAR) {
- eyear += (*month > 2 ? 1 : 0);
- if (*day > 146097 - DAYSPERNYEAR) {
- // We often hit the 400th year when stepping a civil time backwards,
- // so special case it to avoid counting up by 100/4/1 year chunks.
- *day = DaysPerYear(eyear += 400 - 1) - (146097 - *day);
- } else {
- // Handle days in chunks of 100/4/1 years.
- for (int ydays = DaysPer100Years(eyear); *day > ydays;
- *day -= ydays, ydays = DaysPer100Years(eyear)) {
- if ((eyear += 100) > 400) {
- eyear -= 400;
- year_carry += 400;
- }
- }
- for (int ydays = DaysPer4Years(eyear); *day > ydays;
- *day -= ydays, ydays = DaysPer4Years(eyear)) {
- if ((eyear += 4) > 400) {
- eyear -= 400;
- year_carry += 400;
- }
- }
- for (int ydays = DaysPerYear(eyear); *day > ydays;
- *day -= ydays, ydays = DaysPerYear(eyear)) {
- eyear += 1;
- }
- }
- eyear -= (*month > 2 ? 1 : 0);
- }
- // Handle days within one year.
- bool leap_year = IsLeapYear(eyear);
- for (int mdays = kDaysPerMonth[leap_year][*month]; *day > mdays;
- *day -= mdays, mdays = kDaysPerMonth[leap_year][*month]) {
- if (++*month > MONSPERYEAR) {
- *month = 1;
- leap_year = IsLeapYear(++eyear);
- }
- }
- if (*day != orig_day) normalized = true;
-
- // Add the updated eyear back into (year + year_carry).
- year_carry += eyear;
- // Overflow.
- if (*year > DAYORDYEARMAX - year_carry) {
- return false;
- } else if (*year < DAYORDYEARMIN - year_carry) {
- return false;
- }
- *year += year_carry;
- return true;
-}
-
-// Compute the day difference between the day of week in relative date and wday.
-// If the relative date is in future, return positive days. otherwise return the
-// negative future. For example:
-// if day of week in relative date is Mon this week and wday is Wed this week,
-// then return -2.
-// if day of week in relative date is Wed this week and wday is Mon this week,
-// then return 2.
-int32 RelativeDOWToDays(const Property& rd, const int wday) {
- int days = -1;
- int multiplier = 1;
- for (int i = 9; i < rd.int_values.size(); ++i) {
- int inter = rd.int_values.at(i);
- int dow = rd.int_values.at(8) - 1;
- int interval = 0;
- int cur_multiplier = 1;
- if (inter == RelativeParameter_::Interpretation_NEAREST_LAST ||
- inter == RelativeParameter_::Interpretation_PREVIOUS) {
- // Represent the DOW in the last week.
- cur_multiplier = -1;
- if (dow <= wday) {
- interval = 7 + (wday - dow);
- } else {
- interval = 7 - (dow - wday);
- }
- } else if (inter == RelativeParameter_::Interpretation_SECOND_LAST) {
- // Represent the DOW in the week before last week.
- cur_multiplier = -1;
- if (dow <= wday) {
- interval = 14 + (wday - dow);
- } else {
- interval = 14 - (dow - wday);
- }
- } else if (inter == RelativeParameter_::Interpretation_NEAREST_NEXT ||
- inter == RelativeParameter_::Interpretation_COMING) {
- // Represent the DOW in the next week.
- cur_multiplier = 1;
- if (dow <= wday) {
- interval = 7 - (wday - dow);
- } else {
- interval = 7 + (dow - wday);
- }
- // Represent the DOW in the week of next week.
- } else if (inter == RelativeParameter_::Interpretation_SECOND_NEXT) {
- cur_multiplier = 1;
- if (dow <= wday) {
- interval = 14 - (wday - dow);
- } else {
- interval = 14 + (dow - wday);
- }
- // Represent the DOW in the same week regardless of it's past of future.
- } else if (inter == RelativeParameter_::Interpretation_CURRENT ||
- inter == RelativeParameter_::Interpretation_NEAREST ||
- inter == RelativeParameter_::Interpretation_SOME) {
- interval = abs(wday - dow);
- cur_multiplier = dow < wday ? -1 : 1;
- }
- if (days == -1 || interval < days) {
- days = interval;
- multiplier = cur_multiplier;
- }
- }
- return days * multiplier;
-}
-
-// Compute the absolute date and time based on timestamp and relative date and
-// fill the fields year, month, day, hour, minute and second.
-bool RelativeDateToAbsoluteDate(struct tm ts, AnnotationData* date) {
- int idx = GetPropertyIndex(kDateTimeRelative, *date);
- if (idx < 0) {
- return false;
- }
- Property* datetime = FindOrCreateDefaultDateTime(date);
- Property* relative = &date->properties[idx];
- int year = ts.tm_year + 1900; // The year in struct tm is since 1900
- int month = ts.tm_mon + 1; // Convert to [1, 12]
- int day = ts.tm_mday;
- int hour = ts.tm_hour;
- int minute = ts.tm_min;
- int second = ts.tm_sec;
- // If the instance has time, it doesn't make sense to update time based on
- // relative time. so we simply clear the time in relative date.
- // For example: 2 days 1 hours ago at 10:00am, the 1 hours will be removed.
- if (datetime->int_values[3] > 0) {
- relative->int_values[5] = -1;
- relative->int_values[6] = -1;
- relative->int_values[7] = -1;
- }
-
- // Get the relative year, month, day, hour, minute and second.
- if (relative->int_values[8] > 0) {
- day += RelativeDOWToDays(*relative, ts.tm_wday);
- } else {
- int multipler = (relative->int_values[0] > 0) ? 1 : -1;
- if (relative->int_values[1] > 0) {
- year += relative->int_values[1] * multipler;
- }
- if (relative->int_values[2] > 0) {
- month += relative->int_values[2] * multipler;
- }
- if (relative->int_values[3] > 0) {
- day += relative->int_values[3] * multipler;
- }
- if (relative->int_values[5] > 0) {
- hour += relative->int_values[5] * multipler;
- }
- if (relative->int_values[6] > 0) {
- minute += relative->int_values[6] * multipler;
- }
- if (relative->int_values[7] > 0) {
- second += relative->int_values[7] * multipler;
- }
- }
-
- if (!NormalizeDateFields(&year, &month, &day, &hour, &minute, &second)) {
- TC3_VLOG(1) << "Can not normalize date " << year << "-" << month << "-"
- << day << " " << hour << ":" << minute << ":" << second;
- return false;
- }
-
- // Update year, month, day, hour, minute and second of date instance. We only
- // update the time unit if the relative date has it. For example:
- // if the relative date is "1 hour ago", then we don't set minite and second
- // in data intance, but we set hour and the time unit which is larger than
- // hour like day, month and year.
- // if the relative date is "1 year ago", we only update year in date instance
- // and ignore others.
- bool set = false;
- if (relative->int_values[7] >= 0) {
- set = true;
- datetime->int_values[5] = second;
- }
- if (set || relative->int_values[6] >= 0) {
- set = true;
- datetime->int_values[4] = minute;
- }
- if (set || relative->int_values[5] >= 0) {
- set = true;
- datetime->int_values[3] = hour;
- }
- if (set || relative->int_values[3] >= 0 || relative->int_values[8] >= 0) {
- set = true;
- datetime->int_values[2] = day;
- }
- if (set || relative->int_values[2] >= 0) {
- set = true;
- datetime->int_values[1] = month;
- }
- if (set || relative->int_values[1] >= 0) {
- set = true;
- datetime->int_values[0] = year;
- }
- return true;
-}
-
-// If the year is less than 100 and has no bc/ad, it should be normalized.
-static constexpr int kMinYearForNormalization = 100;
-
-// Normalize date instance.
-void NormalizeDateInstance(time_t timestamp, AnnotationData* inst) {
- struct tm ts;
- localtime_r(&timestamp, &ts);
-
- int idx = GetPropertyIndex(kDateTime, *inst);
- if (idx >= 0) {
- Property* datetime = &inst->properties[idx];
- int bc_ad = -1;
- idx = GetPropertyIndex(kDateTimeSupplementary, *inst);
- if (idx >= 0) {
- bc_ad = inst->properties[idx].int_values[0];
- }
-
- int year = datetime->int_values[0];
- if (bc_ad < 0 && year > 0 && year < kMinYearForNormalization) {
- if (2000 + year <= ts.tm_year + 1900) {
- datetime->int_values[0] = 2000 + year;
- } else {
- datetime->int_values[0] = 1900 + year;
- }
- }
- // Day-of-week never only appear in date instance, it must be in both
- // relative date and non-relative date. If the date instance already has day
- // like "Monday, March 19", it doesn't make sense to convert the dow to
- // absolute date again.
- if (datetime->int_values[7] > 0 && datetime->int_values[2] > 0) {
- return;
- }
- }
- RelativeDateToAbsoluteDate(ts, inst);
-}
-
-// Convert normalized date instance to unix time.
-time_t DateInstanceToUnixTimeInternal(time_t timestamp,
- const AnnotationData& inst) {
- int idx = GetPropertyIndex(kDateTime, inst);
- if (idx < 0) {
- return -1;
- }
- const Property& prop = inst.properties[idx];
-
- struct tm ts;
- localtime_r(&timestamp, &ts);
-
- if (prop.int_values[0] > 0) {
- ts.tm_year = prop.int_values[0] - 1900;
- }
- if (prop.int_values[1] > 0) {
- ts.tm_mon = prop.int_values[1] - 1;
- }
- if (prop.int_values[2] > 0) {
- ts.tm_mday = prop.int_values[2];
- }
- if (prop.int_values[3] > 0) {
- ts.tm_hour = prop.int_values[3];
- }
- if (prop.int_values[4] > 0) {
- ts.tm_min = prop.int_values[4];
- }
- if (prop.int_values[5] > 0) {
- ts.tm_sec = prop.int_values[5];
- }
- ts.tm_wday = -1;
- ts.tm_yday = -1;
- return mktime(&ts);
-}
-} // namespace
-
-void NormalizeDateTimes(time_t timestamp, std::vector<Annotation>* dates) {
- for (int i = 0; i < dates->size(); ++i) {
- if ((*dates)[i].data.type == kDateTimeType) {
- NormalizeDateInstance(timestamp, &(*dates)[i].data);
- }
- }
-}
-
-namespace {
bool AnyOverlappedField(const DateMatch& prev, const DateMatch& next) {
#define Field(f) \
if (prev.f && next.f) return true
diff --git a/native/annotator/grammar/dates/utils/date-utils.h b/native/annotator/grammar/dates/utils/date-utils.h
index de459ea..5d4fdca 100644
--- a/native/annotator/grammar/dates/utils/date-utils.h
+++ b/native/annotator/grammar/dates/utils/date-utils.h
@@ -62,12 +62,6 @@
// from matched rule.
void FillDateRangeInstance(const DateRangeMatch& range, Annotation* instance);
-// Normalize DateTimes based on timestamp.
-// Currently it does two things:
-// -- Convert relative date to absolute date
-// -- Normalize year if year is two digit
-void NormalizeDateTimes(time_t timestamp, std::vector<Annotation>* dates);
-
// Merge the fields in DateMatch prev to next if there is no overlapped field.
// If update_span is true, the span of next is also updated.
// e.g.: prev is 11am, next is: May 1, then the merged next is May 1, 11am
diff --git a/native/annotator/model.fbs b/native/annotator/model.fbs
index f5a241f..31fac49 100755
--- a/native/annotator/model.fbs
+++ b/native/annotator/model.fbs
@@ -396,16 +396,6 @@
// instance: "Monday" and "6pm".
enable_date_range:bool = true;
- // If enabled, expand a date series. Must have date_range enabled to be
- // used. The date range cannot exceed 30 days.
- // input: April 4-6, 6:30pm
- // If the flag is true, the extracted annotation will contain 3 instance
- // which are April 4 at 6:30pm, April 5 at 6:30pm and April 6 at 6:30pm
- // all have the same begin and end annotation
- // If the flag is false, the extracted annotation contains one time
- // range instance and one date instance
- expand_date_series:bool = true;
-
// If enabled, the rule priority score is used to set the priority score of
// the annotation.
// In case of false the annotation priority score is set from
@@ -834,7 +824,7 @@
namespace libtextclassifier3;
table NumberAnnotatorOptions {
- // If true, number annotations will be produced.
+ // If true, number and percentage annotations will be produced.
enabled:bool = false;
// Score to assign to the annotated numbers and percentages in the annotator.
@@ -843,32 +833,34 @@
// Number priority score used for conflict resolution with the other models.
priority_score:float = 0;
- // The modes in which to enable number annotations.
+ // The modes in which to enable number and percentage annotations.
enabled_modes:ModeFlag = ALL;
// The annotation usecases for which to produce number annotations.
// This is a flag field for values of AnnotationUsecase.
enabled_annotation_usecases:uint = 4294967295;
- // A list of codepoints that can form a prefix of a valid number.
+ // [Deprecated] A list of codepoints that can form a prefix of a valid number.
allowed_prefix_codepoints:[int];
- // A list of codepoints that can form a suffix of a valid number.
+ // [Deprecated] A list of codepoints that can form a suffix of a valid number.
allowed_suffix_codepoints:[int];
- // List of codepoints that will be stripped from beginning of predicted spans.
+ // [Deprecated] List of codepoints that will be stripped from beginning of
+ // predicted spans.
ignored_prefix_span_boundary_codepoints:[int];
- // List of codepoints that will be stripped from end of predicted spans.
+ // [Deprecated] List of codepoints that will be stripped from end of predicted
+ // spans.
ignored_suffix_span_boundary_codepoints:[int];
- // If true, percent annotations will be produced.
+ // [Deprecated] If true, percent annotations will be produced.
enable_percentage:bool = false;
// Zero separated and ordered list of suffixes that mark a percent.
percentage_pieces_string:string (shared);
- // List of suffixes offsets in the percent_pieces_string string.
+ // [Deprecated] List of suffixes offsets in the percent_pieces_string string.
percentage_pieces_offsets:[int];
// Priority score for the percentage annotation.
@@ -881,6 +873,10 @@
// The maximum number of digits an annotated number can have. Requirement:
// the value should be less or equal to 20.
max_number_of_digits:int = 20;
+
+ // The annotation usecases for which to produce percentage annotations.
+ // This is a flag field for values of AnnotationUsecase.
+ percentage_annotation_usecases:uint = 2;
}
// DurationAnnotator is so far tailored for English and Japanese only.
diff --git a/native/annotator/number/number.cc b/native/annotator/number/number.cc
index fe986ae..3be6ad8 100644
--- a/native/annotator/number/number.cc
+++ b/native/annotator/number/number.cc
@@ -23,6 +23,7 @@
#include "annotator/collections.h"
#include "annotator/types.h"
#include "utils/base/logging.h"
+#include "utils/strings/split.h"
#include "utils/utf8/unicodetext.h"
namespace libtextclassifier3 {
@@ -149,9 +150,8 @@
UTF8ToUnicodeText(tokens[suffix_start_index].value, /*do_copy=*/false)
.begin();
- if (GetPercentSuffixLength(UTF8ToUnicodeText(tokens[suffix_start_index].value,
- /*do_copy=*/false),
- 0) > 0 &&
+ if (percent_suffixes_.find(tokens[suffix_start_index].value) !=
+ percent_suffixes_.end() &&
TokensAreValidEnding(tokens, suffix_start_index + 1)) {
return true;
}
@@ -175,6 +175,25 @@
return false;
}
+int NumberAnnotator::FindPercentSuffixEndCodepoint(
+ const std::vector<Token>& tokens,
+ const int suffix_token_start_index) const {
+ if (suffix_token_start_index >= tokens.size()) {
+ return -1;
+ }
+
+ if (percent_suffixes_.find(tokens[suffix_token_start_index].value) !=
+ percent_suffixes_.end() &&
+ TokensAreValidEnding(tokens, suffix_token_start_index + 1)) {
+ return tokens[suffix_token_start_index].end;
+ }
+ if (tokens[suffix_token_start_index].is_whitespace) {
+ return FindPercentSuffixEndCodepoint(tokens, suffix_token_start_index + 1);
+ }
+
+ return -1;
+}
+
bool NumberAnnotator::TryParseNumber(const UnicodeText& token_text,
const bool is_negative,
int64* parsed_int_value,
@@ -198,8 +217,7 @@
bool NumberAnnotator::FindAll(const UnicodeText& context,
AnnotationUsecase annotation_usecase,
std::vector<AnnotatedSpan>* result) const {
- if (!options_->enabled() || ((1 << annotation_usecase) &
- options_->enabled_annotation_usecases()) == 0) {
+ if (!options_->enabled()) {
return true;
}
@@ -230,80 +248,67 @@
}
const bool has_decimal = !(parsed_int_value == parsed_double_value);
+ const int new_start_codepoint = is_negative ? token.start - 1 : token.start;
- ClassificationResult classification{Collections::Number(),
- options_->score()};
- classification.numeric_value = parsed_int_value;
- classification.numeric_double_value = parsed_double_value;
- classification.priority_score =
- has_decimal ? options_->float_number_priority_score()
- : options_->priority_score();
+ if (((1 << annotation_usecase) & options_->enabled_annotation_usecases()) !=
+ 0) {
+ result->push_back(CreateAnnotatedSpan(
+ new_start_codepoint, token.end, parsed_int_value, parsed_double_value,
+ Collections::Number(), options_->score(),
+ /*priority_score=*/
+ has_decimal ? options_->float_number_priority_score()
+ : options_->priority_score()));
+ }
- AnnotatedSpan annotated_span;
- annotated_span.span = {is_negative ? token.start - 1 : token.start,
- token.end};
- annotated_span.classification.push_back(classification);
- result->push_back(annotated_span);
- }
-
- if (options_->enable_percentage()) {
- FindPercentages(context, result);
+ const int percent_end_codepoint =
+ FindPercentSuffixEndCodepoint(tokens, i + 1);
+ if (percent_end_codepoint != -1 &&
+ ((1 << annotation_usecase) &
+ options_->percentage_annotation_usecases()) != 0) {
+ result->push_back(CreateAnnotatedSpan(
+ new_start_codepoint, percent_end_codepoint, parsed_int_value,
+ parsed_double_value, Collections::Percentage(), options_->score(),
+ options_->percentage_priority_score()));
+ }
}
return true;
}
-std::vector<uint32> NumberAnnotator::FlatbuffersIntVectorToStdVector(
- const flatbuffers::Vector<int32_t>* ints) {
- if (ints == nullptr) {
- return {};
- }
- return {ints->begin(), ints->end()};
+AnnotatedSpan NumberAnnotator::CreateAnnotatedSpan(
+ const int start, const int end, const int int_value,
+ const double double_value, const std::string collection, const float score,
+ const float priority_score) const {
+ ClassificationResult classification{collection, score};
+ classification.numeric_value = int_value;
+ classification.numeric_double_value = double_value;
+ classification.priority_score = priority_score;
+
+ AnnotatedSpan annotated_span;
+ annotated_span.span = {start, end};
+ annotated_span.classification.push_back(classification);
+ return annotated_span;
}
-int NumberAnnotator::GetPercentSuffixLength(const UnicodeText& context,
- int index_codepoints) const {
- if (index_codepoints >= context.size_codepoints()) {
- return -1;
+std::unordered_set<std::string>
+NumberAnnotator::FromFlatbufferStringToUnordredSet(
+ const flatbuffers::String* flatbuffer_percent_strings) {
+ std::unordered_set<std::string> strings_set;
+ if (flatbuffer_percent_strings == nullptr) {
+ return strings_set;
}
- auto context_it = context.begin();
- std::advance(context_it, index_codepoints);
- const StringPiece suffix_context(
- context_it.utf8_data(),
- std::distance(context_it.utf8_data(), context.end().utf8_data()));
- StringSet::Match match;
- percentage_suffixes_trie_.LongestPrefixMatch(suffix_context, &match);
- if (match.match_length == -1) {
- return match.match_length;
- } else {
- return UTF8ToUnicodeText(context_it.utf8_data(), match.match_length,
- /*do_copy=*/false)
- .size_codepoints();
+ const std::string percent_strings = flatbuffer_percent_strings->str();
+ for (StringPiece suffix : strings::Split(percent_strings, '\0')) {
+ std::string percent_suffix = suffix.ToString();
+ percent_suffix.erase(
+ std::remove_if(percent_suffix.begin(), percent_suffix.end(),
+ [](unsigned char x) { return std::isspace(x); }),
+ percent_suffix.end());
+ strings_set.insert(percent_suffix);
}
-}
-void NumberAnnotator::FindPercentages(
- const UnicodeText& context, std::vector<AnnotatedSpan>* result) const {
- const int initial_result_size = result->size();
- for (int i = 0; i < initial_result_size; ++i) {
- AnnotatedSpan annotated_span = (*result)[i];
- if (annotated_span.classification.empty() ||
- annotated_span.classification[0].collection != Collections::Number()) {
- continue;
- }
-
- const int match_length =
- GetPercentSuffixLength(context, annotated_span.span.second);
- if (match_length > 0) {
- annotated_span.span = {annotated_span.span.first,
- annotated_span.span.second + match_length};
- annotated_span.classification[0].collection = Collections::Percentage();
- annotated_span.classification[0].priority_score =
- options_->percentage_priority_score();
- result->push_back(annotated_span);
- }
- }
+ return strings_set;
}
} // namespace libtextclassifier3
diff --git a/native/annotator/number/number.h b/native/annotator/number/number.h
index 6022063..d83bea0 100644
--- a/native/annotator/number/number.h
+++ b/native/annotator/number/number.h
@@ -46,17 +46,8 @@
/*internal_tokenizer_codepoint_ranges=*/{},
/*split_on_script_change=*/false,
/*icu_preserve_whitespace_tokens=*/true)),
- percentage_pieces_string_(
- (options->percentage_pieces_string() == nullptr)
- ? StringPiece()
- : StringPiece(options->percentage_pieces_string()->data(),
- options->percentage_pieces_string()->size())),
- percentage_pieces_offsets_(FlatbuffersIntVectorToStdVector(
- options->percentage_pieces_offsets())),
- percentage_suffixes_trie_(
- SortedStringsTable(/*num_pieces=*/percentage_pieces_offsets_.size(),
- /*offsets=*/percentage_pieces_offsets_.data(),
- /*pieces=*/percentage_pieces_string_)),
+ percent_suffixes_(FromFlatbufferStringToUnordredSet(
+ options_->percentage_pieces_string())),
max_number_of_digits_(options->max_number_of_digits()) {}
// Classifies given text, and if it is a number, it passes the result in
@@ -71,12 +62,10 @@
std::vector<AnnotatedSpan>* result) const;
private:
- static std::vector<uint32> FlatbuffersIntVectorToStdVector(
- const flatbuffers::Vector<int32_t>* ints);
-
- // Get the length of the percent suffix at the specified index in the context.
- int GetPercentSuffixLength(const UnicodeText& context,
- int index_codepoints) const;
+ // Splits a Flatbuffer string of '\0'-separated percent suffixes into an
+ // unordered set, removing all whitespace characters from each suffix.
+ static std::unordered_set<std::string> FromFlatbufferStringToUnordredSet(
+ const flatbuffers::String* flatbuffer_percent_strings);
// Checks if the annotated numbers from the context represent percentages.
// If yes, replaces the collection type and the annotation boundary in the
@@ -87,38 +76,46 @@
// Checks if the tokens from in the interval [start_index-2, start_index] are
// valid characters that can preced a number context.
bool TokensAreValidStart(const std::vector<Token>& tokens,
- const int start_index) const;
+ int start_index) const;
// Checks if the tokens in the interval (..., prefix_end_index] are a valid
// number prefix.
bool TokensAreValidNumberPrefix(const std::vector<Token>& tokens,
- const int prefix_end_index) const;
+ int prefix_end_index) const;
// Checks if the tokens from in the interval [ending_index, ending_index+2]
// are valid characters that can follow a number context.
bool TokensAreValidEnding(const std::vector<Token>& tokens,
- const int ending_index) const;
+ int ending_index) const;
// Checks if the tokens in the interval [suffix_start_index, ...) are a valid
// number suffix.
bool TokensAreValidNumberSuffix(const std::vector<Token>& tokens,
- const int suffix_start_index) const;
+ int suffix_start_index) const;
+
+ // Checks if the tokens in the interval [suffix_token_start_index, ...) are a
+ // valid percent suffix. Returns the end codepoint if so, -1 otherwise.
+ int FindPercentSuffixEndCodepoint(const std::vector<Token>& tokens,
+ int suffix_token_start_index) const;
// Checks if the given text represents a number (either int or double).
- bool TryParseNumber(const UnicodeText& token_text, const bool is_negative,
+ bool TryParseNumber(const UnicodeText& token_text, bool is_negative,
int64* parsed_int_value,
double* parsed_double_value) const;
// Checks if a word contains only CJT characters.
bool IsCJTterm(UnicodeText::const_iterator token_begin_it,
- const int token_length) const;
+ int token_length) const;
+
+ AnnotatedSpan CreateAnnotatedSpan(int start, int end, int int_value,
+ double double_value,
+ const std::string collection, float score,
+ float priority_score) const;
const NumberAnnotatorOptions* options_;
const UniLib* unilib_;
const Tokenizer tokenizer_;
- const StringPiece percentage_pieces_string_;
- const std::vector<uint32> percentage_pieces_offsets_;
- const SortedStringsTable percentage_suffixes_trie_;
+ const std::unordered_set<std::string> percent_suffixes_;
const int max_number_of_digits_;
};
diff --git a/native/annotator/test_data/test_grammar_model.fb b/native/annotator/test_data/test_grammar_model.fb
index d6affd3..73afd79 100644
--- a/native/annotator/test_data/test_grammar_model.fb
+++ b/native/annotator/test_data/test_grammar_model.fb
Binary files differ
diff --git a/native/annotator/test_data/test_model.fb b/native/annotator/test_data/test_model.fb
index 6462e9c..5af8e02 100644
--- a/native/annotator/test_data/test_model.fb
+++ b/native/annotator/test_data/test_model.fb
Binary files differ
diff --git a/native/annotator/test_data/wrong_embeddings.fb b/native/annotator/test_data/wrong_embeddings.fb
index a9815ea..e79ae86 100644
--- a/native/annotator/test_data/wrong_embeddings.fb
+++ b/native/annotator/test_data/wrong_embeddings.fb
Binary files differ
diff --git a/native/annotator/types.h b/native/annotator/types.h
index 0aba85a..60872f1 100644
--- a/native/annotator/types.h
+++ b/native/annotator/types.h
@@ -521,7 +521,7 @@
// Represents a result of Annotate call.
struct AnnotatedSpan {
- enum class Source { OTHER, KNOWLEDGE, DURATION, DATETIME };
+ enum class Source { OTHER, KNOWLEDGE, DURATION, DATETIME, PERSON_NAME };
// Unicode codepoint indices in the input string.
CodepointSpan span = {kInvalidIndex, kInvalidIndex};