|
3 | 3 | import com.muukong.util.Util; |
4 | 4 |
|
5 | 5 | import java.math.BigInteger; |
| 6 | +import java.nio.ByteBuffer; |
| 7 | +import java.nio.charset.CharacterCodingException; |
| 8 | +import java.nio.charset.CharsetDecoder; |
| 9 | +import java.nio.charset.CodingErrorAction; |
| 10 | +import java.nio.charset.StandardCharsets; |
6 | 11 |
|
7 | 12 | /** |
8 | 13 | * A disassembler for protobuf messages that on input a protobuf message attempts to disassemble it |
@@ -215,35 +220,46 @@ private ISerializable disassembleLen(int fieldNumber) { |
215 | 220 |
|
216 | 221 | final int length = disassembleVarInt().toInt(); // We can (rather) safely assume that the length fits into an int |
217 | 222 |
|
218 | | - int printableTokens = 0; |
219 | | - for ( int i = 0; i < length; ++i ) { |
220 | | - byte token = input[cursor + i]; |
221 | | - if ( Util.isPrintable(token) ) |
222 | | - ++printableTokens; |
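| 223 | + // An empty LEN field has no payload to inspect (it could equally be empty bytes, an empty sub-message or an empty packed field); by convention report it as an empty string and skip all heuristics. |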
| 224 | + if ( length == 0 ) { |
| 225 | + return new PBString(""); |
223 | 226 | } |
224 | 227 |
|
225 | 228 | /* |
226 | 229 | The following heuristic is employed: |
227 | | - (1) If all characters are printable, disassemble it as a string and exit. |
| 230 | + (1) If all characters are printable UTF-8 characters, disassemble it as a string and exit. |
228 | 231 | (2) Otherwise, attempt to parse bytes as sub-message. Exit on success. |
229 | 232 | (3) Otherwise, attempt to parse message as packed repeated fields. Exit on success. |
230 | 233 | (4) Otherwise, consume `length` bytes (this should always succeed) and continue |
231 | 234 | */ |
232 | 235 |
|
233 | | - final boolean isString = printableTokens == length; // TODO: can this heuristic be improved? |
234 | | - |
235 | | - // Case (1) |
236 | | - if ( isString ) { |
237 | | - |
238 | | - StringBuilder sb = new StringBuilder(); |
239 | | - for ( int i = 0; i < length; ++i ) { |
240 | | - byte b = input[cursor + i]; |
241 | | - sb.append((char) b); |
| 236 | + // Case (1): try to decode as UTF-8 and verify all characters are printable |
| 237 | + boolean isString = false; |
| 238 | + String stringValue = null; |
| 239 | + { |
| 240 | + CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder() |
| 241 | + .onMalformedInput(CodingErrorAction.REPORT) |
| 242 | + .onUnmappableCharacter(CodingErrorAction.REPORT); |
| 243 | + try { |
| 244 | + stringValue = decoder.decode(ByteBuffer.wrap(input, cursor, length)).toString(); |
| 245 | + // Reject strings that contain non-printable control characters |
| 246 | + // (the whitespace controls 0x09-0x0D are allowed: tab=0x09, LF=0x0A, VT=0x0B, FF=0x0C, CR=0x0D) |
| 247 | + isString = true; |
| 248 | + for ( int i = 0; i < stringValue.length(); ++i ) { |
| 249 | + char c = stringValue.charAt(i); |
| 250 | + if ( c < 0x09 || (c > 0x0D && c < 0x20) || c == 0x7F ) { |
| 251 | + isString = false; |
| 252 | + break; |
| 253 | + } |
| 254 | + } |
| 255 | + } catch ( CharacterCodingException e ) { |
| 256 | + isString = false; |
242 | 257 | } |
| 258 | + } |
243 | 259 |
|
| 260 | + if ( isString ) { |
244 | 261 | cursor += length; |
245 | | - |
246 | | - return new PBString( sb.toString() ); |
| 262 | + return new PBString( stringValue ); |
247 | 263 | } |
248 | 264 |
|
249 | 265 | // Case (2) |
|
0 commit comments