| package com.fasterxml.jackson.core.io; |
| |
| import java.io.*; |
| |
| |
| /** |
| * Since JDK does not come with UTF-32/UCS-4, let's implement a simple |
| * decoder to use. |
| */ |
| public class UTF32Reader extends Reader |
| { |
| /** |
| * JSON actually limits available Unicode range in the high end |
| * to the same as xml (to basically limit UTF-8 max byte sequence |
| * length to 4) |
| */ |
| final protected static int LAST_VALID_UNICODE_CHAR = 0x10FFFF; |
| |
| final protected static char NC = (char) 0; |
| |
| final protected IOContext _context; |
| |
| protected InputStream _in; |
| |
| protected byte[] _buffer; |
| |
| protected int _ptr; |
| protected int _length; |
| |
| protected final boolean _bigEndian; |
| |
| /** |
| * Although input is fine with full Unicode set, Java still uses |
| * 16-bit chars, so we may have to split high-order chars into |
| * surrogate pairs. |
| */ |
| protected char _surrogate = NC; |
| |
| /** |
| * Total read character count; used for error reporting purposes |
| */ |
| protected int _charCount; |
| |
| /** |
| * Total read byte count; used for error reporting purposes |
| */ |
| protected int _byteCount; |
| |
| protected final boolean _managedBuffers; |
| |
| /* |
| /********************************************************** |
| /* Life-cycle |
| /********************************************************** |
| */ |
| |
| public UTF32Reader(IOContext ctxt, InputStream in, byte[] buf, int ptr, int len, boolean isBigEndian) { |
| _context = ctxt; |
| _in = in; |
| _buffer = buf; |
| _ptr = ptr; |
| _length = len; |
| _bigEndian = isBigEndian; |
| _managedBuffers = (in != null); |
| } |
| |
| /* |
| /********************************************************** |
| /* Public API |
| /********************************************************** |
| */ |
| |
| @Override |
| public void close() throws IOException { |
| InputStream in = _in; |
| |
| if (in != null) { |
| _in = null; |
| freeBuffers(); |
| in.close(); |
| } |
| } |
| |
| protected char[] _tmpBuf; |
| |
| /** |
| * Although this method is implemented by the base class, AND it should |
| * never be called by main code, let's still implement it bit more |
| * efficiently just in case |
| */ |
| @Override |
| public int read() throws IOException { |
| if (_tmpBuf == null) { |
| _tmpBuf = new char[1]; |
| } |
| if (read(_tmpBuf, 0, 1) < 1) { |
| return -1; |
| } |
| return _tmpBuf[0]; |
| } |
| |
| @Override |
| public int read(char[] cbuf, int start, int len) throws IOException { |
| // Already EOF? |
| if (_buffer == null) { return -1; } |
| if (len < 1) { return len; } |
| // Let's then ensure there's enough room... |
| if (start < 0 || (start+len) > cbuf.length) { |
| reportBounds(cbuf, start, len); |
| } |
| |
| int outPtr = start; |
| final int outEnd = len+start; |
| |
| // Ok, first; do we have a surrogate from last round? |
| if (_surrogate != NC) { |
| cbuf[outPtr++] = _surrogate; |
| _surrogate = NC; |
| // No need to load more, already got one char |
| } else { |
| // Note: we'll try to avoid blocking as much as possible. As a |
| // result, we only need to get 4 bytes for a full char. |
| int left = (_length - _ptr); |
| if (left < 4) { |
| if (!loadMore(left)) { // (legal) EOF? |
| // Ok if (but only if!) was at boundary |
| if (left == 0) { |
| return -1; |
| } |
| reportUnexpectedEOF(_length - _ptr, 4); |
| } |
| } |
| } |
| |
| // 02-Jun-2017, tatu: Must ensure we don't try to read past buffer end: |
| final int _lastValidInputStart = (_length - 3); |
| |
| main_loop: |
| while (outPtr < outEnd) { |
| int ptr = _ptr; |
| int hi, lo; |
| |
| if (_bigEndian) { |
| hi = (_buffer[ptr] << 8) | (_buffer[ptr+1] & 0xFF); |
| lo = ((_buffer[ptr+2] & 0xFF) << 8) | (_buffer[ptr+3] & 0xFF); |
| } else { |
| lo = (_buffer[ptr] & 0xFF) | ((_buffer[ptr+1] & 0xFF) << 8); |
| hi = (_buffer[ptr+2] & 0xFF)| (_buffer[ptr+3] << 8); |
| } |
| _ptr += 4; |
| |
| // Does it need to be split to surrogates? |
| // (also, we can and need to verify illegal chars) |
| if (hi != 0) { // need to split into surrogates? |
| hi &= 0xFFFF; // since it may be sign extended |
| int ch = ((hi - 1) << 16) | lo; // ch -= 0x10000; to normalize starting with 0x0 |
| if (hi > 0x10) { // last valid is 0x10FFFF |
| reportInvalid(ch, outPtr-start, |
| String.format(" (above 0x%08x)", LAST_VALID_UNICODE_CHAR)); |
| } |
| cbuf[outPtr++] = (char) (0xD800 + (ch >> 10)); |
| // hmmh. can this ever be 0? (not legal, at least?) |
| lo = (0xDC00 | (ch & 0x03FF)); |
| // Room for second part? |
| if (outPtr >= outEnd) { // nope |
| _surrogate = (char) ch; |
| break main_loop; |
| } |
| } |
| cbuf[outPtr++] = (char) lo; |
| if (_ptr > _lastValidInputStart) { |
| break main_loop; |
| } |
| } |
| int actualLen = (outPtr - start); |
| _charCount += actualLen; |
| return actualLen; |
| } |
| |
| /* |
| /********************************************************** |
| /* Internal methods |
| /********************************************************** |
| */ |
| |
| private void reportUnexpectedEOF(int gotBytes, int needed) throws IOException { |
| int bytePos = _byteCount + gotBytes, charPos = _charCount; |
| |
| throw new CharConversionException("Unexpected EOF in the middle of a 4-byte UTF-32 char: got "+gotBytes+", needed "+needed+", at char #"+charPos+", byte #"+bytePos+")"); |
| } |
| |
| private void reportInvalid(int value, int offset, String msg) throws IOException { |
| int bytePos = _byteCount + _ptr - 1, charPos = _charCount + offset; |
| |
| throw new CharConversionException("Invalid UTF-32 character 0x"+Integer.toHexString(value)+msg+" at char #"+charPos+", byte #"+bytePos+")"); |
| } |
| |
| /** |
| * @param available Number of "unused" bytes in the input buffer |
| * |
| * @return True, if enough bytes were read to allow decoding of at least |
| * one full character; false if EOF was encountered instead. |
| */ |
| private boolean loadMore(int available) throws IOException { |
| _byteCount += (_length - available); |
| |
| // Bytes that need to be moved to the beginning of buffer? |
| if (available > 0) { |
| if (_ptr > 0) { |
| System.arraycopy(_buffer, _ptr, _buffer, 0, available); |
| _ptr = 0; |
| } |
| _length = available; |
| } else { |
| /* Ok; here we can actually reasonably expect an EOF, |
| * so let's do a separate read right away: |
| */ |
| _ptr = 0; |
| int count = (_in == null) ? -1 : _in.read(_buffer); |
| if (count < 1) { |
| _length = 0; |
| if (count < 0) { // -1 |
| if (_managedBuffers) { |
| freeBuffers(); // to help GC? |
| } |
| return false; |
| } |
| // 0 count is no good; let's err out |
| reportStrangeStream(); |
| } |
| _length = count; |
| } |
| |
| /* Need at least 4 bytes; if we don't get that many, it's an |
| * error. |
| */ |
| while (_length < 4) { |
| int count = (_in == null) ? -1 : _in.read(_buffer, _length, _buffer.length - _length); |
| if (count < 1) { |
| if (count < 0) { // -1, EOF... no good! |
| if (_managedBuffers) { |
| freeBuffers(); // to help GC? |
| } |
| reportUnexpectedEOF(_length, 4); |
| } |
| // 0 count is no good; let's err out |
| reportStrangeStream(); |
| } |
| _length += count; |
| } |
| return true; |
| } |
| |
| /** |
| * This method should be called along with (or instead of) normal |
| * close. After calling this method, no further reads should be tried. |
| * Method will try to recycle read buffers (if any). |
| */ |
| private void freeBuffers() { |
| byte[] buf = _buffer; |
| if (buf != null) { |
| _buffer = null; |
| _context.releaseReadIOBuffer(buf); |
| } |
| } |
| |
| private void reportBounds(char[] cbuf, int start, int len) throws IOException { |
| throw new ArrayIndexOutOfBoundsException("read(buf,"+start+","+len+"), cbuf["+cbuf.length+"]"); |
| } |
| |
| private void reportStrangeStream() throws IOException { |
| throw new IOException("Strange I/O stream, returned 0 bytes on read"); |
| } |
| } |