blob: 4c616fc13d54df1cc6a30f1c942d3e7336e26061 [file] [log] [blame]
package com.fasterxml.jackson.core.io;
import java.util.Arrays;
public final class CharTypes
{
private final static char[] HC = "0123456789ABCDEF".toCharArray();
private final static byte[] HB;
static {
int len = HC.length;
HB = new byte[len];
for (int i = 0; i < len; ++i) {
HB[i] = (byte) HC[i];
}
}
/**
* Lookup table used for determining which input characters
* need special handling when contained in text segment.
*/
private final static int[] sInputCodes;
static {
/* 96 would do for most cases (backslash is ASCII 94)
* but if we want to do lookups by raw bytes it's better
* to have full table
*/
final int[] table = new int[256];
// Control chars and non-space white space are not allowed unquoted
for (int i = 0; i < 32; ++i) {
table[i] = -1;
}
// And then string end and quote markers are special too
table['"'] = 1;
table['\\'] = 1;
sInputCodes = table;
}
/**
* Additionally we can combine UTF-8 decoding info into similar
* data table.
*/
private final static int[] sInputCodesUTF8;
static {
final int[] table = new int[sInputCodes.length];
System.arraycopy(sInputCodes, 0, table, 0, table.length);
for (int c = 128; c < 256; ++c) {
int code;
// We'll add number of bytes needed for decoding
if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF)
code = 2;
} else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF)
code = 3;
} else if ((c & 0xF8) == 0xF0) {
// 4 bytes; double-char with surrogates and all...
code = 4;
} else {
// And -1 seems like a good "universal" error marker...
code = -1;
}
table[c] = code;
}
sInputCodesUTF8 = table;
}
/**
* To support non-default (and -standard) unquoted field names mode,
* need to have alternate checking.
* Basically this is list of 8-bit ASCII characters that are legal
* as part of Javascript identifier
*/
private final static int[] sInputCodesJsNames;
static {
final int[] table = new int[256];
// Default is "not a name char", mark ones that are
Arrays.fill(table, -1);
// Assume rules with JS same as Java (change if/as needed)
for (int i = 33; i < 256; ++i) {
if (Character.isJavaIdentifierPart((char) i)) {
table[i] = 0;
}
}
/* As per [JACKSON-267], '@', '#' and '*' are also to be accepted as well.
* And '-' (for hyphenated names); and '+' for sake of symmetricity...
*/
table['@'] = 0;
table['#'] = 0;
table['*'] = 0;
table['-'] = 0;
table['+'] = 0;
sInputCodesJsNames = table;
}
/**
* This table is similar to Latin-1, except that it marks all "high-bit"
* code as ok. They will be validated at a later point, when decoding
* name
*/
private final static int[] sInputCodesUtf8JsNames;
static {
final int[] table = new int[256];
// start with 8-bit JS names
System.arraycopy(sInputCodesJsNames, 0, table, 0, table.length);
Arrays.fill(table, 128, 128, 0);
sInputCodesUtf8JsNames = table;
}
/**
* Decoding table used to quickly determine characters that are
* relevant within comment content.
*/
private final static int[] sInputCodesComment;
static {
final int[] buf = new int[256];
// but first: let's start with UTF-8 multi-byte markers:
System.arraycopy(sInputCodesUTF8, 128, buf, 128, 128);
// default (0) means "ok" (skip); -1 invalid, others marked by char itself
Arrays.fill(buf, 0, 32, -1); // invalid white space
buf['\t'] = 0; // tab is still fine
buf['\n'] = '\n'; // lf/cr need to be observed, ends cpp comment
buf['\r'] = '\r';
buf['*'] = '*'; // end marker for c-style comments
sInputCodesComment = buf;
}
/**
* Decoding table used for skipping white space and comments.
*
* @since 2.3
*/
private final static int[] sInputCodesWS;
static {
// but first: let's start with UTF-8 multi-byte markers:
final int[] buf = new int[256];
System.arraycopy(sInputCodesUTF8, 128, buf, 128, 128);
// default (0) means "not whitespace" (end); 1 "whitespace", -1 invalid,
// 2-4 UTF-8 multi-bytes, others marked by char itself
//
Arrays.fill(buf, 0, 32, -1); // invalid white space
buf[' '] = 1;
buf['\t'] = 1;
buf['\n'] = '\n'; // lf/cr need to be observed, ends cpp comment
buf['\r'] = '\r';
buf['/'] = '/'; // start marker for c/cpp comments
buf['#'] = '#'; // start marker for YAML comments
sInputCodesWS = buf;
}
/**
* Lookup table used for determining which output characters in
* 7-bit ASCII range need to be quoted.
*/
private final static int[] sOutputEscapes128;
static {
int[] table = new int[128];
// Control chars need generic escape sequence
for (int i = 0; i < 32; ++i) {
// 04-Mar-2011, tatu: Used to use "-(i + 1)", replaced with constant
table[i] = CharacterEscapes.ESCAPE_STANDARD;
}
/* Others (and some within that range too) have explicit shorter
* sequences
*/
table['"'] = '"';
table['\\'] = '\\';
// Escaping of slash is optional, so let's not add it
table[0x08] = 'b';
table[0x09] = 't';
table[0x0C] = 'f';
table[0x0A] = 'n';
table[0x0D] = 'r';
sOutputEscapes128 = table;
}
/**
* Lookup table for the first 128 Unicode characters (7-bit ASCII)
* range. For actual hex digits, contains corresponding value;
* for others -1.
*/
private final static int[] sHexValues = new int[128];
static {
Arrays.fill(sHexValues, -1);
for (int i = 0; i < 10; ++i) {
sHexValues['0' + i] = i;
}
for (int i = 0; i < 6; ++i) {
sHexValues['a' + i] = 10 + i;
sHexValues['A' + i] = 10 + i;
}
}
public static int[] getInputCodeLatin1() { return sInputCodes; }
public static int[] getInputCodeUtf8() { return sInputCodesUTF8; }
public static int[] getInputCodeLatin1JsNames() { return sInputCodesJsNames; }
public static int[] getInputCodeUtf8JsNames() { return sInputCodesUtf8JsNames; }
public static int[] getInputCodeComment() { return sInputCodesComment; }
public static int[] getInputCodeWS() { return sInputCodesWS; }
/**
* Accessor for getting a read-only encoding table for first 128 Unicode
* code points (single-byte UTF-8 characters).
* Value of 0 means "no escaping"; other positive values that value is character
* to use after backslash; and negative values that generic (backslash - u)
* escaping is to be used.
*/
public static int[] get7BitOutputEscapes() { return sOutputEscapes128; }
public static int charToHex(int ch)
{
return (ch > 127) ? -1 : sHexValues[ch];
}
public static void appendQuoted(StringBuilder sb, String content)
{
final int[] escCodes = sOutputEscapes128;
int escLen = escCodes.length;
for (int i = 0, len = content.length(); i < len; ++i) {
char c = content.charAt(i);
if (c >= escLen || escCodes[c] == 0) {
sb.append(c);
continue;
}
sb.append('\\');
int escCode = escCodes[c];
if (escCode < 0) { // generic quoting (hex value)
// The only negative value sOutputEscapes128 returns
// is CharacterEscapes.ESCAPE_STANDARD, which mean
// appendQuotes should encode using the Unicode encoding;
// not sure if this is the right way to encode for
// CharacterEscapes.ESCAPE_CUSTOM or other (future)
// CharacterEscapes.ESCAPE_XXX values.
// We know that it has to fit in just 2 hex chars
sb.append('u');
sb.append('0');
sb.append('0');
int value = c; // widening
sb.append(HC[value >> 4]);
sb.append(HC[value & 0xF]);
} else { // "named", i.e. prepend with slash
sb.append((char) escCode);
}
}
}
public static char[] copyHexChars() {
return (char[]) HC.clone();
}
public static byte[] copyHexBytes() {
return (byte[]) HB.clone();
}
}