From 00e65bc638dc3fe1e7ab01fba5ddfcdbacbedaee Mon Sep 17 00:00:00 2001 From: Ge Gao Date: Sat, 21 Sep 2024 13:35:59 -0700 Subject: [PATCH] Change unicode escaping in JSON (#10887) Summary: Change how unicode is escaped in JSON Text. The objective is to make it consistent with Presto and make a canonical JSON representation. The implementation is consistent with Java jackson 2.11.0 Utf8Generator Pull Request resolved: https://github.com/facebookincubator/velox/pull/10887 Reviewed By: Yuhta Differential Revision: D62591195 Pulled By: gggrace14 fbshipit-source-id: 25235f97e371103197e522d5ffb5c090a7e30888 --- velox/functions/lib/RegistrationHelpers.h | 1 - velox/functions/lib/Utf8Utils.h | 31 +++ velox/functions/prestosql/json/CMakeLists.txt | 1 + .../prestosql/json/JsonStringUtil.cpp | 184 ++++++++++++++++++ .../functions/prestosql/json/JsonStringUtil.h | 54 +++++ .../prestosql/tests/JsonCastTest.cpp | 72 +++++-- velox/functions/prestosql/types/JsonType.cpp | 17 +- 7 files changed, 335 insertions(+), 25 deletions(-) create mode 100644 velox/functions/prestosql/json/JsonStringUtil.cpp create mode 100644 velox/functions/prestosql/json/JsonStringUtil.h diff --git a/velox/functions/lib/RegistrationHelpers.h b/velox/functions/lib/RegistrationHelpers.h index b5f05d91180e..3b7f55c55b7b 100644 --- a/velox/functions/lib/RegistrationHelpers.h +++ b/velox/functions/lib/RegistrationHelpers.h @@ -16,7 +16,6 @@ #pragma once #include "velox/functions/Registerer.h" -#include "velox/functions/prestosql/types/TimestampWithTimeZoneType.h" namespace facebook::velox::functions { namespace { diff --git a/velox/functions/lib/Utf8Utils.h b/velox/functions/lib/Utf8Utils.h index 11cd3404addd..369e1151e93f 100644 --- a/velox/functions/lib/Utf8Utils.h +++ b/velox/functions/lib/Utf8Utils.h @@ -17,6 +17,10 @@ #include +#include "folly/CPortability.h" + +#include "velox/common/base/Exceptions.h" + namespace facebook::velox::functions { /// This function is not part of the original utf8proc. 
@@ -48,4 +52,61 @@ namespace facebook::velox::functions {
 /// https://github.com/airlift/slice/blob/master/src/main/java/io/airlift/slice/SliceUtf8.java
 int32_t tryGetCharLength(const char* input, int64_t size);
 
+/// Return the length in bytes (1 to 4) of the well-formed UTF-8 encoded
+/// character at the beginning of `string`. Return -1 if `string` does not
+/// begin with one, that is when the leading byte is not a valid lead, a
+/// continuation byte is missing before `end` or malformed, or the sequence is
+/// an overlong encoding, a UTF-16 surrogate or a code point above 0x10FFFF.
+/// @param string: First byte to examine. Must be before `end`.
+/// @param end: One past the last readable byte.
+FOLLY_ALWAYS_INLINE int validateAndGetNextUtf8Length(
+    const unsigned char* string,
+    const unsigned char* end) {
+  VELOX_DCHECK(string < end, "Expect non-empty string.");
+
+  const unsigned char lead = *string;
+  if (lead < 0x80u) {
+    return 1;
+  }
+
+  // Number of continuation bytes the lead asks for, and the range allowed for
+  // the first of them. The range is narrower than [0x80, 0xBF] for the leads
+  // that could otherwise start an overlong encoding (0xE0, 0xF0), a surrogate
+  // (0xED) or a code point above 0x10FFFF (0xF4).
+  int numContinuations = 0;
+  unsigned char low = 0x80u;
+  unsigned char high = 0xBFu;
+  if (lead >= 0xC2u && lead <= 0xDFu) {
+    numContinuations = 1;
+  } else if ((lead & 0xF0u) == 0xE0u) {
+    numContinuations = 2;
+    if (lead == 0xE0u) {
+      low = 0xA0u;
+    } else if (lead == 0xEDu) {
+      high = 0x9Fu;
+    }
+  } else if (lead >= 0xF0u && lead <= 0xF4u) {
+    numContinuations = 3;
+    if (lead == 0xF0u) {
+      low = 0x90u;
+    } else if (lead == 0xF4u) {
+      high = 0x8Fu;
+    }
+  } else {
+    // Stray continuation byte, 0xC0 or 0xC1 (overlong only), or above 0xF4.
+    return -1;
+  }
+
+  if (end - string <= numContinuations || string[1] < low ||
+      string[1] > high) {
+    return -1;
+  }
+  for (int i = 2; i <= numContinuations; ++i) {
+    if ((string[i] & 0xC0u) != 0x80u) {
+      return -1;
+    }
+  }
+  return numContinuations + 1;
+}
+
 } // namespace facebook::velox::functions
diff --git a/velox/functions/prestosql/json/CMakeLists.txt b/velox/functions/prestosql/json/CMakeLists.txt
index 20f4ef810a72..549cf3f15317 100644
--- a/velox/functions/prestosql/json/CMakeLists.txt
+++ b/velox/functions/prestosql/json/CMakeLists.txt
@@ -15,6 +15,7 @@ velox_add_library(
   velox_functions_json
   JsonExtractor.cpp
   JsonPathTokenizer.cpp
+  JsonStringUtil.cpp
   SIMDJsonExtractor.cpp
   SIMDJsonUtil.cpp)
 
diff --git a/velox/functions/prestosql/json/JsonStringUtil.cpp b/velox/functions/prestosql/json/JsonStringUtil.cpp
new file mode 100644
index 000000000000..43be101ec40c
--- /dev/null
+++ b/velox/functions/prestosql/json/JsonStringUtil.cpp
@@ -0,0 +1,194 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <algorithm>
+#include <array>
+#include <cstring>
+
+#include "folly/CPortability.h"
+
+#include "velox/common/base/Exceptions.h"
+#include "velox/functions/lib/Utf8Utils.h"
+#include "velox/functions/prestosql/json/JsonStringUtil.h"
+
+using namespace facebook::velox::functions;
+
+namespace facebook::velox {
+namespace {
+
+// Number of chars in one "\uXXXX" escape written by `writeHex`.
+constexpr size_t kEncodedHexSize = 6;
+
+// Return the uppercase hexadecimal digit for `c`, which must be below 16.
+FOLLY_ALWAYS_INLINE char hexDigit(uint8_t c) {
+  VELOX_DCHECK_LT(c, 16);
+  return c < 10 ? c + '0' : c - 10 + 'A';
+}
+
+// Write `value` as '\', 'u' and four uppercase hexadecimal digits, and move
+// `out` forward by 6 chars. The digits are taken by shifting, which does not
+// depend on the host byte order, so `value` must not be byte-swapped first.
+FOLLY_ALWAYS_INLINE void writeHex(char16_t value, char*& out) {
+  *out++ = '\\';
+  *out++ = 'u';
+  *out++ = hexDigit((value >> 12) & 0x0F);
+  *out++ = hexDigit((value >> 8) & 0x0F);
+  *out++ = hexDigit((value >> 4) & 0x0F);
+  *out++ = hexDigit(value & 0x0F);
+}
+
+// Build the table that tells how each ASCII character is written: 0 means as
+// is, a positive value means '\' followed by that char, and a negative value
+// means a "\u00XX" escape.
+std::array<int8_t, 128> getAsciiEscapes() {
+  // Value-initialize so that every entry not set below is 0.
+  std::array<int8_t, 128> escapes{};
+  // Control characters are hex-escaped unless they have a short escape.
+  std::fill(escapes.begin(), escapes.begin() + 32, -1);
+  escapes['"'] = '"';
+  escapes['\\'] = '\\';
+  escapes['\b'] = 'b';
+  escapes['\t'] = 't';
+  escapes['\n'] = 'n';
+  escapes['\f'] = 'f';
+  escapes['\r'] = 'r';
+  return escapes;
+}
+static const std::array<int8_t, 128> asciiEscapes = getAsciiEscapes();
+
+// Write the ASCII character `value`, which must be in [0, 0x7F], to `out` in
+// the form given by `asciiEscapes`, and move `out` past the chars written.
+FOLLY_ALWAYS_INLINE void encodeAscii(int8_t value, char*& out) {
+  const int8_t escapeCode = asciiEscapes[value];
+  if (escapeCode == 0) {
+    *out++ = char(value);
+  } else if (escapeCode > 0) {
+    *out++ = '\\';
+    *out++ = char(escapeCode);
+  } else {
+    writeHex(static_cast<char16_t>(value), out);
+  }
+}
+
+// Build the table of the number of chars that `encodeAscii` writes for each
+// ASCII character.
+std::array<int8_t, 128> getEncodedAsciiSizes() {
+  std::array<int8_t, 128> sizes{};
+  for (int c = 0; c < 128; c++) {
+    const int8_t escapeCode = asciiEscapes[c];
+    if (escapeCode == 0) {
+      sizes[c] = 1;
+    } else if (escapeCode > 0) {
+      sizes[c] = 2;
+    } else {
+      sizes[c] = static_cast<int8_t>(kEncodedHexSize);
+    }
+  }
+  return sizes;
+}
+static const std::array<int8_t, 128> encodedAsciiSizes = getEncodedAsciiSizes();
+
+// Encode `codePoint` value into one or two UTF-16 code units. Write each code
+// unit as prefixed hexadecimals of 6 chars.
+FOLLY_ALWAYS_INLINE void encodeUtf16Hex(char32_t codePoint, char*& out) {
+  VELOX_DCHECK(codePoint <= 0x10FFFFu);
+  // Two 16-bit code units are needed.
+  if (codePoint >= 0x10000u) {
+    writeHex(
+        static_cast<char16_t>(
+            0xD800u + (((codePoint - 0x10000u) >> 10) & 0x3FFu)),
+        out);
+    writeHex(
+        static_cast<char16_t>(0xDC00u + ((codePoint - 0x10000u) & 0x3FFu)),
+        out);
+    return;
+  }
+  // One 16-bit code unit is needed.
+  writeHex(static_cast<char16_t>(codePoint), out);
+}
+
+} // namespace
+
+void testingEncodeUtf16Hex(char32_t codePoint, char*& out) {
+  encodeUtf16Hex(codePoint, out);
+}
+
+void escapeString(const char* input, size_t length, char* output) {
+  char* pos = output;
+
+  auto* start = reinterpret_cast<const unsigned char*>(input);
+  auto* end = reinterpret_cast<const unsigned char*>(input + length);
+  while (start < end) {
+    // Keep the cases below in sync with `escapedStringSize`: every case must
+    // write as many chars and consume as many bytes as is counted there.
+    const int count = validateAndGetNextUtf8Length(start, end);
+    switch (count) {
+      case 1:
+        encodeAscii(int8_t(*start), pos);
+        break;
+      case 2:
+      case 3:
+        // Well-formed characters of the BMP beyond ASCII are copied as is.
+        std::memcpy(pos, start, count);
+        pos += count;
+        break;
+      case 4: {
+        // The sequence is well-formed, so the decoded value is within
+        // [0x10000, 0x10FFFF].
+        const char32_t codePoint = (char32_t(start[0] & 0x07u) << 18) |
+            (char32_t(start[1] & 0x3Fu) << 12) |
+            (char32_t(start[2] & 0x3Fu) << 6) | char32_t(start[3] & 0x3Fu);
+        encodeUtf16Hex(codePoint, pos);
+        break;
+      }
+      default:
+        // Replace the offending byte alone and resume at the next one.
+        writeHex(0xFFFDu, pos);
+        ++start;
+        continue;
+    }
+    start += count;
+  }
+}
+
+size_t escapedStringSize(const char* input, size_t length) {
+  size_t outSize = 0;
+
+  auto* start = reinterpret_cast<const unsigned char*>(input);
+  auto* end = reinterpret_cast<const unsigned char*>(input + length);
+  while (start < end) {
+    int count = validateAndGetNextUtf8Length(start, end);
+    switch (count) {
+      case 1:
+        outSize += encodedAsciiSizes[int8_t(*start)];
+        break;
+      case 2:
+      case 3:
+        outSize += count;
+        break;
+      case 4:
+        outSize += kEncodedHexSize * 2;
+        break;
+      default:
+        outSize += kEncodedHexSize;
+        count = 1;
+    }
+    start += count;
+  }
+
+  return outSize;
+}
+
+} // namespace facebook::velox
diff --git a/velox/functions/prestosql/json/JsonStringUtil.h b/velox/functions/prestosql/json/JsonStringUtil.h
new file mode 100644
index 000000000000..65cadd86bf68
--- /dev/null
+++ b/velox/functions/prestosql/json/JsonStringUtil.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cstddef>
+
+namespace facebook::velox {
+/// Escape the unicode characters of `input` to make it canonical for JSON
+/// and legal to print in JSON text. It is assumed that the input is UTF-8
+/// encoded.
+/// It handles the different unicode planes or code point ranges as follows,
+/// 1. Basic Multilingual Plane [0, 0xFFFF]
+///   a. [0, 0x7F] ASCII. Input is encoded by one UTF-8 byte. '"' and '\' are
+///      prefixed with '\'. Backspace, tab, newline, form feed and carriage
+///      return are written as "\b", "\t", "\n", "\f" and "\r". Other control
+///      characters below 0x20 are written as "\u00XX". All other characters
+///      are written as is.
+///   b. [0x80, 0x07FF]. Input is encoded by two UTF-8 bytes. Output the UTF-8
+///      encoding of the code point, which are thus identical bytes as
+///      the input.
+///   c. [0x0800, 0xD7FF] + [0xE000, 0xFFFF]. Input is encoded by three UTF-8
+///      bytes. Output the UTF-8 encoding of the code point, which are thus
+///      identical bytes as the input.
+/// 2. 16 Supplementary Planes [0x10000, 0x10FFFF]
+///   a. [0x10000, 0x10FFFF]. Input is encoded by four UTF-8 bytes. Output
+///      the UTF-16 encoding of the code point, with two UTF-16 code units in
+///      uppercase hexadecimal and prefixed with '\' and 'u'.
+/// Every byte that does not begin a well-formed UTF-8 character, including
+/// overlong encodings, UTF-16 surrogates and code points above 0x10FFFF, is
+/// replaced by "\uFFFD".
+/// @param input: Input string to escape that is UTF-8 encoded.
+/// @param length: Length of the input string.
+/// @param output: Output string to write the escaped input to. The caller is
+/// responsible for allocating at least `escapedStringSize(input, length)`
+/// chars. No terminating null char is written.
+void escapeString(const char* input, size_t length, char* output);
+
+/// Return the size of string after the unicode characters of `input` are
+/// escaped using the method as in `escapeString`. The function will iterate
+/// over `input` once.
+/// @param input: Input string to escape that is UTF-8 encoded.
+/// @param length: Length of the input string.
+size_t escapedStringSize(const char* input, size_t length);
+
+/// For test only. Encode `codePoint` value by UTF-16 and write the one or two
+/// prefixed hexadecimals to `out`. Move `out` forward by 6 or 12 chars
+/// accordingly. The caller shall ensure there is enough space in `out`.
+void testingEncodeUtf16Hex(char32_t codePoint, char*& out);
+} // namespace facebook::velox
diff --git a/velox/functions/prestosql/tests/JsonCastTest.cpp b/velox/functions/prestosql/tests/JsonCastTest.cpp
index 0e2aca2a5db7..b89a14f5a186 100644
--- a/velox/functions/prestosql/tests/JsonCastTest.cpp
+++ b/velox/functions/prestosql/tests/JsonCastTest.cpp
@@ -13,7 +13,9 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
*/ +#include "folly/Unicode.h" +#include "velox/functions/prestosql/json/JsonStringUtil.h" #include "velox/functions/prestosql/tests/CastBaseTest.h" #include "velox/functions/prestosql/types/JsonType.h" @@ -245,27 +247,71 @@ TEST_F(JsonCastTest, fromInvalidUtf8) { auto invalidString = fromBytes({0xBF}); testCastToJson( - VARCHAR(), {StringView(invalidString)}, {"\"\\ufffd\""}); + VARCHAR(), {StringView(invalidString)}, {"\"\\uFFFD\""}); invalidString = fmt::format("head_{}_tail", fromBytes({0xBF})); testCastToJson( - VARCHAR(), {StringView(invalidString)}, {"\"head_\\ufffd_tail\""}); + VARCHAR(), {StringView(invalidString)}, {"\"head_\\uFFFD_tail\""}); } TEST_F(JsonCastTest, fromVarchar) { - testCastToJson(VARCHAR(), {"\U0001F64F"}, {"\"\\ud83d\\ude4f\""}); - testCastToJson( - VARCHAR(), - {"aaa"_sv, "bbb"_sv, "ccc"_sv}, - {R"("aaa")"_sv, R"("bbb")"_sv, R"("ccc")"_sv}); + // Test casting from ASCII. + { + std::vector asciiCharacters; + for (int c = 32; c < 0x80; c++) { + if (c != '\"' && c != '\\') { + asciiCharacters.push_back(c); + } + } + std::string asciiString = folly::join("", asciiCharacters); + std::string expected = fmt::format("\"{}\"", asciiString); + testCastToJson( + VARCHAR(), {StringView(asciiString)}, {StringView(expected)}); + + testCastToJson( + VARCHAR(), + {"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\"\\ ."_sv}, + {R"("\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007\b\t\n\u000B\f\r\u000E\u000F\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F\"\\ .")"_sv}); + } + + // Test casting from unicodes in BMP. 
+ { + std::vector charactersInUtf8; + for (int i = 0x80; i < 0x10000; i++) { + if (folly::utf16_code_unit_is_bmp(char16_t(i))) { + charactersInUtf8.push_back(folly::codePointToUtf8(char32_t(i))); + } + } + std::string utf8String = folly::join("", charactersInUtf8); + std::string expected = fmt::format("\"{}\"", utf8String); + testCastToJson( + VARCHAR(), {StringView(utf8String)}, {StringView(expected)}); + } + + // Test casting from unicodes in supplementary planes. + { + std::vector charactersInUtf8; + std::vector charactersInUtf16; + for (int i = 0x10000; i < 0x110000; i++) { + charactersInUtf8.push_back(folly::codePointToUtf8(char32_t(i))); + + std::string utf16Hex(12, '\0'); + char* pos = utf16Hex.data(); + testingEncodeUtf16Hex(char32_t(i), pos); + charactersInUtf16.push_back(utf16Hex); + } + std::string utf8String = folly::join("", charactersInUtf8); + std::string expected = + fmt::format("\"{}\"", folly::join("", charactersInUtf16)); + testCastToJson( + VARCHAR(), {StringView(utf8String)}, {StringView(expected)}); + } + testCastToJson( VARCHAR(), - {""_sv, - std::nullopt, - "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\"\\ ."_sv}, - {"\"\""_sv, - std::nullopt, - R"("\u0001\u0002\u0003\u0004\u0005\u0006\u0007\b\t\n\u000b\f\r\u000e\u000f\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001a\u001b\u001c\u001d\u001e\u001f\"\\ .")"_sv}); + {""_sv, std::nullopt, "\xc0"_sv}, + {"\"\""_sv, std::nullopt, R"("\uFFFD")"_sv}); + testCastToJson( VARCHAR(), {std::nullopt, std::nullopt, std::nullopt, std::nullopt}, diff --git a/velox/functions/prestosql/types/JsonType.cpp b/velox/functions/prestosql/types/JsonType.cpp index 7875606f7870..35558028336f 100644 --- a/velox/functions/prestosql/types/JsonType.cpp +++ b/velox/functions/prestosql/types/JsonType.cpp @@ -21,7 +21,6 @@ #include #include -#include "folly/CPortability.h" #include "folly/Conv.h" #include "folly/json.h" @@ -32,6 
+31,7 @@ #include "velox/expression/VectorWriters.h" #include "velox/functions/lib/RowsTranslationUtil.h" #include "velox/functions/lib/string/StringCore.h" +#include "velox/functions/prestosql/json/JsonStringUtil.h" #include "velox/functions/prestosql/json/SIMDJsonUtil.h" #include "velox/type/Conversions.h" #include "velox/type/Type.h" @@ -50,16 +50,11 @@ void generateJsonTyped( auto value = input.valueAt(row); if constexpr (std::is_same_v) { - // TODO Presto escapes Unicode characters using uppercase hex: - // SELECT cast(U&'\+01F64F' as json); -- "\uD83D\uDE4F" - // Folly uses lowercase hex digits: "\ud83d\ude4f". - // Figure out how to produce uppercase digits. - folly::json::serialization_opts opts; - opts.encode_non_ascii = true; - // Replace invalid UTF-8 bytes with U+FFFD. - opts.skip_invalid_utf8 = true; - - folly::json::escapeString(value, result, opts); + size_t resultSize = escapedStringSize(value.data(), value.size()); + result.resize(resultSize + 2); + result.data()[0] = '"'; + escapeString(value.data(), value.size(), result.data() + 1); + result.data()[resultSize + 1] = '"'; } else if constexpr (std::is_same_v) { VELOX_FAIL( "Casting UNKNOWN to JSON: Vectors of UNKNOWN type should not contain non-null rows");