blob: d9b0603a4be514c80e1fd53da52f30b93378b1a1 [file] [log] [blame]
package com.fasterxml.jackson.core.sym;
import java.util.Arrays;
import java.util.BitSet;
import java.util.concurrent.atomic.AtomicReference;
import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.util.InternCache;
/**
* This class is a kind of specialized type-safe Map, from char array to
* String value. Specialization means that in addition to type-safety
* and specific access patterns (key char array, Value optionally interned
* String; values added on access if necessary), and that instances are
* meant to be used concurrently, but by using well-defined mechanisms
* to obtain such concurrently usable instances. Main use for the class
* is to store symbol table information for things like compilers and
* parsers; especially when number of symbols (keywords) is limited.
*<p>
* For optimal performance, usage pattern should be one where matches
* should be very common (especially after "warm-up"), and as with most hash-based
* maps/sets, that hash codes are uniformly distributed. Also, collisions
* are slightly more expensive than with HashMap or HashSet, since hash codes
* are not used in resolving collisions; that is, equals() comparison is
* done with all symbols in same bucket index.<br>
* Finally, rehashing is also more expensive, as hash codes are not
* stored; rehashing requires all entries' hash codes to be recalculated.
* Reason for not storing hash codes is reduced memory usage, hoping
* for better memory locality.
*<p>
* Usual usage pattern is to create a single "master" instance, and either
* use that instance in sequential fashion, or to create derived "child"
* instances, which after use, are asked to return possible symbol additions
* to master instance. In either case benefit is that symbol table gets
* initialized so that further uses are more efficient, as eventually all
* symbols needed will already be in symbol table. At that point no more
* Symbol String allocations are needed, nor changes to symbol table itself.
*<p>
* Note that while individual SymbolTable instances are NOT thread-safe
* (much like generic collection classes), concurrently used "child"
* instances can be freely used without synchronization. However, using
* master table concurrently with child instances can only be done if
* access to master instance is read-only (i.e. no modifications done).
*/
public final class CharsToNameCanonicalizer
{
/* If we use "multiply-add" based hash algorithm, this is the multiplier
* we use.
*<p>
* Note that JDK uses 31; but it seems that 33 produces fewer collisions,
* at least with tests we have.
*/
public final static int HASH_MULT = 33;
/**
* Default initial table size. Shouldn't be miniscule (as there's
* cost to both array realloc and rehashing), but let's keep
* it reasonably small. For systems that properly
* reuse factories it doesn't matter either way; but when
* recreating factories often, initial overhead may dominate.
*/
private static final int DEFAULT_T_SIZE = 64;
/**
* Let's not expand symbol tables past some maximum size;
* this should protected against OOMEs caused by large documents
* with unique (~= random) names.
*/
private static final int MAX_T_SIZE = 0x10000; // 64k entries == 256k mem
/**
* Let's only share reasonably sized symbol tables. Max size set to 3/4 of 16k;
* this corresponds to 64k main hash index. This should allow for enough distinct
* names for almost any case.
*/
static final int MAX_ENTRIES_FOR_REUSE = 12000;
/**
* Also: to thwart attacks based on hash collisions (which may or may not
* be cheap to calculate), we will need to detect "too long"
* collision chains. Let's start with static value of 255 entries
* for the longest legal chain.
*<p>
* Note: longest chain we have been able to produce without malicious
* intent has been 38 (with "com.fasterxml.jackson.core.main.TestWithTonsaSymbols");
* our setting should be reasonable here.
*<p>
* Also note that value was lowered from 255 (2.3 and earlier) to 100 for 2.4
*
* @since 2.1
*/
static final int MAX_COLL_CHAIN_LENGTH = 100;
/*
/**********************************************************
/* Configuration
/**********************************************************
*/
/**
* Sharing of learnt symbols is done by optional linking of symbol
* table instances with their parents. When parent linkage is
* defined, and child instance is released (call to <code>release</code>),
* parent's shared tables may be updated from the child instance.
*/
final private CharsToNameCanonicalizer _parent;
/**
* Member that is only used by the root table instance: root
* passes immutable state info child instances, and children
* may return new state if they add entries to the table.
* Child tables do NOT use the reference.
*/
final private AtomicReference<TableInfo> _tableInfo;
/**
* Seed value we use as the base to make hash codes non-static between
* different runs, but still stable for lifetime of a single symbol table
* instance.
* This is done for security reasons, to avoid potential DoS attack via
* hash collisions.
*
* @since 2.1
*/
final private int _seed;
final private int _flags;
/**
* Whether any canonicalization should be attempted (whether using
* intern or not.
*<p>
* NOTE: non-final since we may need to disable this with overflow.
*/
private boolean _canonicalize;
/*
/**********************************************************
/* Actual symbol table data
/**********************************************************
*/
/**
* Primary matching symbols; it's expected most match occur from
* here.
*/
private String[] _symbols;
/**
* Overflow buckets; if primary doesn't match, lookup is done
* from here.
*<p>
* Note: Number of buckets is half of number of symbol entries, on
* assumption there's less need for buckets.
*/
private Bucket[] _buckets;
/**
* Current size (number of entries); needed to know if and when
* rehash.
*/
private int _size;
/**
* Limit that indicates maximum size this instance can hold before
* it needs to be expanded and rehashed. Calculated using fill
* factor passed in to constructor.
*/
private int _sizeThreshold;
/**
* Mask used to get index from hash values; equal to
* <code>_buckets.length - 1</code>, when _buckets.length is
* a power of two.
*/
private int _indexMask;
/**
* We need to keep track of the longest collision list; this is needed
* both to indicate problems with attacks and to allow flushing for
* other cases.
*
* @since 2.1
*/
private int _longestCollisionList;
/*
/**********************************************************
/* State regarding shared arrays
/**********************************************************
*/
/**
* Flag that indicates whether underlying data structures for
* the main hash area are shared or not. If they are, then they
* need to be handled in copy-on-write way, i.e. if they need
* to be modified, a copy needs to be made first; at this point
* it will not be shared any more, and can be modified.
*<p>
* This flag needs to be checked both when adding new main entries,
* and when adding new collision list queues (i.e. creating a new
* collision list head entry)
*/
private boolean _hashShared;
/*
/**********************************************************
/* Bit of DoS detection goodness
/**********************************************************
*/
/**
* Lazily constructed structure that is used to keep track of
* collision buckets that have overflowed once: this is used
* to detect likely attempts at denial-of-service attacks that
* uses hash collisions.
*
* @since 2.4
*/
private BitSet _overflows;
/*
/**********************************************************
/* Life-cycle: constructors
/**********************************************************
*/
/**
* Main method for constructing a root symbol table instance.
*/
private CharsToNameCanonicalizer(int seed)
{
_parent = null;
_seed = seed;
// these settings don't really matter for the bootstrap instance
_canonicalize = true;
_flags = -1;
// And we'll also set flags so no copying of buckets is needed:
_hashShared = false; // doesn't really matter for root instance
_longestCollisionList = 0;
_tableInfo = new AtomicReference<TableInfo>(TableInfo.createInitial(DEFAULT_T_SIZE));
// and actually do NOT assign buffers so we'll find if anyone tried to
// use root instance
}
/**
* Internal constructor used when creating child instances.
*/
private CharsToNameCanonicalizer(CharsToNameCanonicalizer parent, int flags, int seed,
TableInfo parentState)
{
_parent = parent;
_seed = seed;
_tableInfo = null; // not used by child tables
_flags = flags;
_canonicalize = JsonFactory.Feature.CANONICALIZE_FIELD_NAMES.enabledIn(flags);
// Then copy shared state
_symbols = parentState.symbols;
_buckets = parentState.buckets;
_size = parentState.size;
_longestCollisionList = parentState.longestCollisionList;
// Hard-coded fill factor, 75%
int arrayLen = (_symbols.length);
_sizeThreshold = _thresholdSize(arrayLen);
_indexMask = (arrayLen - 1);
// Need to make copies of arrays, if/when adding new entries
_hashShared = true;
}
private static int _thresholdSize(int hashAreaSize) { return hashAreaSize - (hashAreaSize >> 2); }
/*
/**********************************************************
/* Life-cycle: factory methods, merging
/**********************************************************
*/
/**
* Method called to create root canonicalizer for a {@link com.fasterxml.jackson.core.JsonFactory}
* instance. Root instance is never used directly; its main use is for
* storing and sharing underlying symbol arrays as needed.
*/
public static CharsToNameCanonicalizer createRoot() {
// Need to use a variable seed, to thwart hash-collision based attacks.
// 14-Feb-2017, tatu: not sure it actually helps, at all, since it won't
// change mixing or any of the steps. Should likely just remove in future.
long now = System.currentTimeMillis();
// ensure it's not 0; and might as well require to be odd so:
int seed = (((int) now) + ((int) (now >>> 32))) | 1;
return createRoot(seed);
}
protected static CharsToNameCanonicalizer createRoot(int seed) {
return new CharsToNameCanonicalizer(seed);
}
/**
* "Factory" method; will create a new child instance of this symbol
* table. It will be a copy-on-write instance, ie. it will only use
* read-only copy of parent's data, but when changes are needed, a
* copy will be created.
*<p>
* Note: while this method is synchronized, it is generally not
* safe to both use makeChild/mergeChild, AND to use instance
* actively. Instead, a separate 'root' instance should be used
* on which only makeChild/mergeChild are called, but instance itself
* is not used as a symbol table.
*/
public CharsToNameCanonicalizer makeChild(int flags) {
return new CharsToNameCanonicalizer(this, flags, _seed, _tableInfo.get());
}
/**
* Method called by the using code to indicate it is done with this instance.
* This lets instance merge accumulated changes into parent (if need be),
* safely and efficiently, and without calling code having to know about parent
* information.
*/
public void release() {
// If nothing has been added, nothing to do
if (!maybeDirty()) { return; }
// we will try to merge if child table has new entries
if (_parent != null && _canonicalize) { // canonicalize set to false if max size was reached
_parent.mergeChild(new TableInfo(this));
// Let's also mark this instance as dirty, so that just in
// case release was too early, there's no corruption of possibly shared data.
_hashShared = true;
}
}
/**
* Method that allows contents of child table to potentially be
* "merged in" with contents of this symbol table.
*<p>
* Note that caller has to make sure symbol table passed in is
* really a child or sibling of this symbol table.
*/
private void mergeChild(TableInfo childState)
{
final int childCount = childState.size;
TableInfo currState = _tableInfo.get();
// Should usually grow; but occasionally could also shrink if (but only if)
// collision list overflow ends up clearing some collision lists.
if (childCount == currState.size) {
return;
}
// One caveat: let's try to avoid problems with degenerate cases of documents with
// generated "random" names: for these, symbol tables would bloat indefinitely.
// One way to do this is to just purge tables if they grow
// too large, and that's what we'll do here.
if (childCount > MAX_ENTRIES_FOR_REUSE) {
// At any rate, need to clean up the tables
childState = TableInfo.createInitial(DEFAULT_T_SIZE);
}
_tableInfo.compareAndSet(currState, childState);
}
/*
/**********************************************************
/* Public API, generic accessors:
/**********************************************************
*/
public int size() {
if (_tableInfo != null) { // root table
return _tableInfo.get().size;
}
// nope, child table
return _size;
}
/**
* Method for checking number of primary hash buckets this symbol
* table uses.
*
* @since 2.1
*/
public int bucketCount() { return _symbols.length; }
public boolean maybeDirty() { return !_hashShared; }
public int hashSeed() { return _seed; }
/**
* Method mostly needed by unit tests; calculates number of
* entries that are in collision list. Value can be at most
* ({@link #size} - 1), but should usually be much lower, ideally 0.
*
* @since 2.1
*/
public int collisionCount() {
int count = 0;
for (Bucket bucket : _buckets) {
if (bucket != null) {
count += bucket.length;
}
}
return count;
}
/**
* Method mostly needed by unit tests; calculates length of the
* longest collision chain. This should typically be a low number,
* but may be up to {@link #size} - 1 in the pathological case
*
* @since 2.1
*/
public int maxCollisionLength() { return _longestCollisionList; }
/*
/**********************************************************
/* Public API, accessing symbols:
/**********************************************************
*/
public String findSymbol(char[] buffer, int start, int len, int h)
{
if (len < 1) { // empty Strings are simplest to handle up front
return "";
}
if (!_canonicalize) { // [JACKSON-259]
return new String(buffer, start, len);
}
/* Related to problems with sub-standard hashing (somewhat
* relevant for collision attacks too), let's try little
* bit of shuffling to improve hash codes.
* (note, however, that this can't help with full collisions)
*/
int index = _hashToIndex(h);
String sym = _symbols[index];
// Optimal case; checking existing primary symbol for hash index:
if (sym != null) {
// Let's inline primary String equality checking:
if (sym.length() == len) {
int i = 0;
while (sym.charAt(i) == buffer[start+i]) {
// Optimal case; primary match found
if (++i == len) {
return sym;
}
}
}
Bucket b = _buckets[index>>1];
if (b != null) {
sym = b.has(buffer, start, len);
if (sym != null) {
return sym;
}
sym = _findSymbol2(buffer, start, len, b.next);
if (sym != null) {
return sym;
}
}
}
return _addSymbol(buffer, start, len, h, index);
}
private String _findSymbol2(char[] buffer, int start, int len, Bucket b) {
while (b != null) {
String sym = b.has(buffer, start, len);
if (sym != null) {
return sym;
}
b = b.next;
}
return null;
}
private String _addSymbol(char[] buffer, int start, int len, int h, int index)
{
if (_hashShared) { //need to do copy-on-write?
copyArrays();
_hashShared = false;
} else if (_size >= _sizeThreshold) { // Need to expand?
rehash();
/* Need to recalc hash; rare occurence (index mask has been
* recalculated as part of rehash)
*/
index = _hashToIndex(calcHash(buffer, start, len));
}
String newSymbol = new String(buffer, start, len);
if (JsonFactory.Feature.INTERN_FIELD_NAMES.enabledIn(_flags)) {
newSymbol = InternCache.instance.intern(newSymbol);
}
++_size;
// Ok; do we need to add primary entry, or a bucket?
if (_symbols[index] == null) {
_symbols[index] = newSymbol;
} else {
final int bix = (index >> 1);
Bucket newB = new Bucket(newSymbol, _buckets[bix]);
int collLen = newB.length;
if (collLen > MAX_COLL_CHAIN_LENGTH) {
// 23-May-2014, tatu: Instead of throwing an exception right away,
// let's handle in bit smarter way.
_handleSpillOverflow(bix, newB);
} else {
_buckets[bix] = newB;
_longestCollisionList = Math.max(collLen, _longestCollisionList);
}
}
return newSymbol;
}
private void _handleSpillOverflow(int bindex, Bucket newBucket)
{
if (_overflows == null) {
_overflows = new BitSet();
_overflows.set(bindex);
} else {
if (_overflows.get(bindex)) {
// Has happened once already, so not a coincident...
if (JsonFactory.Feature.FAIL_ON_SYMBOL_HASH_OVERFLOW.enabledIn(_flags)) {
reportTooManyCollisions(MAX_COLL_CHAIN_LENGTH);
}
// but even if we don't fail, we will stop canonicalizing:
_canonicalize = false;
} else {
_overflows.set(bindex);
}
}
// regardless, if we get this far, clear up the bucket, adjust size appropriately.
_symbols[bindex + bindex] = newBucket.symbol;
_buckets[bindex] = null;
// newBucket contains new symbol; but we wil
_size -= (newBucket.length);
// we could calculate longest; but for now just mark as invalid
_longestCollisionList = -1;
}
/**
* Helper method that takes in a "raw" hash value, shuffles it as necessary,
* and truncates to be used as the index.
*/
public int _hashToIndex(int rawHash) {
// doing these seems to help a bit
rawHash += (rawHash >>> 15);
rawHash ^= (rawHash << 7);
rawHash += (rawHash >>> 3);
return (rawHash & _indexMask);
}
/**
* Implementation of a hashing method for variable length
* Strings. Most of the time intention is that this calculation
* is done by caller during parsing, not here; however, sometimes
* it needs to be done for parsed "String" too.
*
* @param len Length of String; has to be at least 1 (caller guarantees
* this pre-condition)
*/
public int calcHash(char[] buffer, int start, int len) {
int hash = _seed;
for (int i = start, end = start+len; i < end; ++i) {
hash = (hash * HASH_MULT) + (int) buffer[i];
}
// NOTE: shuffling, if any, is done in 'findSymbol()', not here:
return (hash == 0) ? 1 : hash;
}
public int calcHash(String key)
{
final int len = key.length();
int hash = _seed;
for (int i = 0; i < len; ++i) {
hash = (hash * HASH_MULT) + (int) key.charAt(i);
}
// NOTE: shuffling, if any, is done in 'findSymbol()', not here:
return (hash == 0) ? 1 : hash;
}
/*
/**********************************************************
/* Internal methods
/**********************************************************
*/
/**
* Method called when copy-on-write is needed; generally when first
* change is made to a derived symbol table.
*/
private void copyArrays() {
final String[] oldSyms = _symbols;
_symbols = Arrays.copyOf(oldSyms, oldSyms.length);
final Bucket[] oldBuckets = _buckets;
_buckets = Arrays.copyOf(oldBuckets, oldBuckets.length);
}
/**
* Method called when size (number of entries) of symbol table grows
* so big that load factor is exceeded. Since size has to remain
* power of two, arrays will then always be doubled. Main work
* is really redistributing old entries into new String/Bucket
* entries.
*/
private void rehash() {
int size = _symbols.length;
int newSize = size + size;
/* 12-Mar-2010, tatu: Let's actually limit maximum size we are
* prepared to use, to guard against OOME in case of unbounded
* name sets (unique [non-repeating] names)
*/
if (newSize > MAX_T_SIZE) {
// If this happens, there's no point in either growing or shrinking hash areas.
// Rather, let's just cut our losses and stop canonicalizing.
_size = 0;
_canonicalize = false;
// in theory, could just leave these as null, but...
_symbols = new String[DEFAULT_T_SIZE];
_buckets = new Bucket[DEFAULT_T_SIZE>>1];
_indexMask = DEFAULT_T_SIZE-1;
_hashShared = false;
return;
}
String[] oldSyms = _symbols;
Bucket[] oldBuckets = _buckets;
_symbols = new String[newSize];
_buckets = new Bucket[newSize >> 1];
// Let's update index mask, threshold, now (needed for rehashing)
_indexMask = newSize - 1;
_sizeThreshold = _thresholdSize(newSize);
int count = 0; // let's do sanity check
// Need to do two loops, unfortunately, since spill-over area is
// only half the size:
int maxColl = 0;
for (int i = 0; i < size; ++i) {
String symbol = oldSyms[i];
if (symbol != null) {
++count;
int index = _hashToIndex(calcHash(symbol));
if (_symbols[index] == null) {
_symbols[index] = symbol;
} else {
int bix = (index >> 1);
Bucket newB = new Bucket(symbol, _buckets[bix]);
_buckets[bix] = newB;
maxColl = Math.max(maxColl, newB.length);
}
}
}
size >>= 1;
for (int i = 0; i < size; ++i) {
Bucket b = oldBuckets[i];
while (b != null) {
++count;
String symbol = b.symbol;
int index = _hashToIndex(calcHash(symbol));
if (_symbols[index] == null) {
_symbols[index] = symbol;
} else {
int bix = (index >> 1);
Bucket newB = new Bucket(symbol, _buckets[bix]);
_buckets[bix] = newB;
maxColl = Math.max(maxColl, newB.length);
}
b = b.next;
}
}
_longestCollisionList = maxColl;
_overflows = null;
if (count != _size) {
throw new IllegalStateException(String.format(
"Internal error on SymbolTable.rehash(): had %d entries; now have %d",
_size, count));
}
}
/**
* @since 2.1
*/
protected void reportTooManyCollisions(int maxLen) {
throw new IllegalStateException("Longest collision chain in symbol table (of size "+_size
+") now exceeds maximum, "+maxLen+" -- suspect a DoS attack based on hash collisions");
}
// For debugging, comment out
/*
@Override
public String toString()
{
StringBuilder sb = new StringBuilder();
int primaryCount = 0;
for (String s : _symbols) {
if (s != null) ++primaryCount;
}
sb.append("[BytesToNameCanonicalizer, size: ");
sb.append(_size);
sb.append('/');
sb.append(_symbols.length);
sb.append(", ");
sb.append(primaryCount);
sb.append('/');
sb.append(_size - primaryCount);
sb.append(" coll; avg length: ");
// Average length: minimum of 1 for all (1 == primary hit);
// and then 1 per each traversal for collisions/buckets
//int maxDist = 1;
int pathCount = _size;
for (Bucket b : _buckets) {
if (b != null) {
int spillLen = b.length;
for (int j = 1; j <= spillLen; ++j) {
pathCount += j;
}
}
}
double avgLength;
if (_size == 0) {
avgLength = 0.0;
} else {
avgLength = (double) pathCount / (double) _size;
}
// let's round up a bit (two 2 decimal places)
//avgLength -= (avgLength % 0.01);
sb.append(avgLength);
sb.append(']');
return sb.toString();
}
*/
/*
/**********************************************************
/* Helper classes
/**********************************************************
*/
/**
* This class is a symbol table entry. Each entry acts as a node
* in a linked list.
*/
static final class Bucket
{
public final String symbol;
public final Bucket next;
public final int length;
public Bucket(String s, Bucket n) {
symbol = s;
next = n;
length = (n == null) ? 1 : n.length+1;
}
public String has(char[] buf, int start, int len) {
if (symbol.length() != len) {
return null;
}
int i = 0;
do {
if (symbol.charAt(i) != buf[start+i]) {
return null;
}
} while (++i < len);
return symbol;
}
}
/**
* Immutable value class used for sharing information as efficiently
* as possible, by only require synchronization of reference manipulation
* but not access to contents.
*
* @since 2.8.7
*/
private final static class TableInfo
{
final int size;
final int longestCollisionList;
final String[] symbols;
final Bucket[] buckets;
public TableInfo(int size, int longestCollisionList,
String[] symbols, Bucket[] buckets)
{
this.size = size;
this.longestCollisionList = longestCollisionList;
this.symbols = symbols;
this.buckets = buckets;
}
public TableInfo(CharsToNameCanonicalizer src)
{
this.size = src._size;
this.longestCollisionList = src._longestCollisionList;
this.symbols = src._symbols;
this.buckets = src._buckets;
}
public static TableInfo createInitial(int sz) {
return new TableInfo(0,
0, // longestCollisionList
new String[sz], new Bucket[sz >> 1]);
}
}
}