Change unicode escaping in JSON (#10887)

Summary: Change how Unicode is escaped in JSON text. The objective is to make it consistent with Presto and to produce a canonical JSON representation. The implementation is consistent with the Java Jackson 2.11.0 Utf8Generator. Pull Request resolved: #10887. Reviewed By: Yuhta. Differential Revision: D62591195. Pulled By: gggrace14. fbshipit-source-id: 25235f97e371103197e522d5ffb5c090a7e30888
facebookincubator · Sep 21, 2024 · 00e65bc · 00e65bc
1 parent 1e736ba
commit 00e65bc
Show file tree

Hide file tree

Showing 7 changed files with 335 additions and 25 deletions.
diff --git a/velox/functions/lib/RegistrationHelpers.h b/velox/functions/lib/RegistrationHelpers.h
@@ -16,7 +16,6 @@
 #pragma once
 
 #include "velox/functions/Registerer.h"
-#include "velox/functions/prestosql/types/TimestampWithTimeZoneType.h"
 
 namespace facebook::velox::functions {
 namespace {

diff --git a/velox/functions/lib/Utf8Utils.h b/velox/functions/lib/Utf8Utils.h
@@ -17,6 +17,10 @@
 
 #include <cstdint>
 
+#include "folly/CPortability.h"
+
+#include "velox/common/base/Exceptions.h"
+
 namespace facebook::velox::functions {
 
 /// This function is not part of the original utf8proc.
@@ -48,4 +52,31 @@ namespace facebook::velox::functions {
 /// https://github.com/airlift/slice/blob/master/src/main/java/io/airlift/slice/SliceUtf8.java
 int32_t tryGetCharLength(const char* input, int64_t size);
 
+/// Determine how many bytes the UTF-8 sequence starting at `string` occupies.
+/// Only the bit pattern of the lead byte and the `10xxxxxx` marker of each
+/// required continuation byte are inspected. Overlong forms, surrogate code
+/// points and values above 0x10FFFF are not rejected by this function.
+/// @param string First byte of the sequence. Must be strictly before `end`.
+/// @param end One past the last readable byte.
+/// @return 1, 2, 3 or 4 when the lead byte is recognized and all of its
+/// continuation bytes are present before `end` and carry the continuation
+/// marker; -1 otherwise.
+FOLLY_ALWAYS_INLINE int validateAndGetNextUtf8Length(
+    const unsigned char* string,
+    const unsigned char* end) {
+  VELOX_DCHECK(string < end, "Expect non-empty string.");
+
+  const unsigned char lead = *string;
+
+  // 0xxxxxxx: a single ASCII byte, nothing else to check.
+  if ((lead & 0x80u) == 0) {
+    return 1;
+  }
+
+  // Derive the expected sequence length from the lead byte pattern.
+  int expected;
+  if ((lead & 0xE0u) == 0xC0u) {
+    expected = 2; // 110xxxxx
+  } else if ((lead & 0xF0u) == 0xE0u) {
+    expected = 3; // 1110xxxx
+  } else if ((lead & 0xF8u) == 0xF0u) {
+    expected = 4; // 11110xxx
+  } else {
+    return -1;
+  }
+
+  // The sequence is truncated when fewer than `expected` bytes remain.
+  if (end - string < expected) {
+    return -1;
+  }
+
+  // Every trailing byte must look like 10xxxxxx.
+  for (int i = 1; i < expected; ++i) {
+    if ((string[i] & 0xC0u) != 0x80u) {
+      return -1;
+    }
+  }
+  return expected;
+}
+
 } // namespace facebook::velox::functions
diff --git a/velox/functions/prestosql/json/CMakeLists.txt b/velox/functions/prestosql/json/CMakeLists.txt
@@ -15,6 +15,7 @@ velox_add_library(
   velox_functions_json
   JsonExtractor.cpp
   JsonPathTokenizer.cpp
+  JsonStringUtil.cpp
   SIMDJsonExtractor.cpp
   SIMDJsonUtil.cpp)
 

diff --git a/velox/functions/prestosql/json/JsonStringUtil.cpp b/velox/functions/prestosql/json/JsonStringUtil.cpp
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <array>
+
+#include "folly/Unicode.h"
+
+#include "velox/common/base/Exceptions.h"
+#include "velox/functions/lib/Utf8Utils.h"
+#include "velox/functions/prestosql/json/JsonStringUtil.h"
+
+using namespace facebook::velox::functions;
+
+namespace facebook::velox {
+namespace {
+
+/// Map a nibble in [0, 15] to its uppercase hexadecimal character.
+FOLLY_ALWAYS_INLINE char hexDigit(uint8_t c) {
+  VELOX_DCHECK_LT(c, 16);
+  if (c >= 10) {
+    return c - 10 + 'A';
+  }
+  return c + '0';
+}
+
+/// Append the six-character JSON escape `\uXXXX` for the UTF-16 code unit
+/// `value` to `out`, using uppercase hexadecimal digits, most significant
+/// nibble first.
+/// @param value UTF-16 code unit to print.
+/// @param out Write cursor. It is advanced by 6; no bounds check is done, so
+/// the caller must provide at least 6 writable chars.
+FOLLY_ALWAYS_INLINE void writeHex(char16_t value, char*& out) {
+  // The shifts and masks below operate on the numeric value of `value`, not
+  // on its in-memory byte order, so no endianness conversion must be applied:
+  // byte-swapping here would print the wrong digits on big-endian hosts.
+  *out++ = '\\';
+  *out++ = 'u';
+  *out++ = hexDigit((value >> 12) & 0x0F);
+  *out++ = hexDigit((value >> 8) & 0x0F);
+  *out++ = hexDigit((value >> 4) & 0x0F);
+  *out++ = hexDigit(value & 0x0F);
+}
+
+/// Build the escape table for the 128 ASCII code points. Each entry is one of
+///   0  - the character needs no escaping,
+///   >0 - the character is escaped as a backslash followed by the entry value,
+///   -1 - the character is escaped in `\uXXXX` form.
+/// Control characters [0, 31] default to -1; the ones with a short JSON escape
+/// (\b, \t, \n, \f, \r) as well as '"' and '\\' get their short form.
+std::array<int8_t, 128> getAsciiEscapes() {
+  // Value-initialize: every entry that is not assigned below must read as 0.
+  // A default-initialized local array would leave them indeterminate.
+  std::array<int8_t, 128> escapes{};
+  for (int c = 0; c < 32; ++c) {
+    escapes[c] = -1;
+  }
+  escapes['"'] = '"';
+  escapes['\\'] = '\\';
+  escapes['\b'] = 'b';
+  escapes['\t'] = 't';
+  escapes['\n'] = 'n';
+  escapes['\f'] = 'f';
+  escapes['\r'] = 'r';
+  return escapes;
+}
+static const std::array<int8_t, 128> asciiEscapes = getAsciiEscapes();
+
+/// Write the JSON form of the ASCII character `value` to `out` and advance
+/// `out` past it. Depending on the `asciiEscapes` entry the output is the
+/// character itself (1 char), a backslash escape (2 chars) or `\uXXXX`
+/// (6 chars).
+/// @param value Character to encode. It indexes the 128-entry escape table,
+/// so it is expected to be in [0, 127].
+/// @param out Write cursor; no bounds check is done.
+FOLLY_ALWAYS_INLINE void encodeAscii(int8_t value, char*& out) {
+  const int8_t escapeCode = asciiEscapes[value];
+
+  // Negative entry: no short escape exists, fall back to \uXXXX.
+  if (escapeCode < 0) {
+    writeHex(value, out);
+    return;
+  }
+
+  // Positive entry: backslash followed by the short escape character.
+  if (escapeCode > 0) {
+    *out++ = '\\';
+    *out++ = static_cast<char>(escapeCode);
+    return;
+  }
+
+  // Zero entry: the character is copied as is.
+  *out++ = static_cast<char>(value);
+}
+
+/// Build a table giving, for each ASCII code point, the number of chars its
+/// escaped form takes: 1 when `asciiEscapes` holds 0, 2 when it holds a
+/// positive short-escape character, 6 when it holds a negative value.
+std::array<int8_t, 128> getEncodedAsciiSizes() {
+  std::array<int8_t, 128> sizes;
+  for (size_t i = 0; i < sizes.size(); ++i) {
+    const int8_t escapeCode = asciiEscapes[i];
+    sizes[i] = escapeCode == 0 ? 1 : (escapeCode > 0 ? 2 : 6);
+  }
+  return sizes;
+}
+static const std::array<int8_t, 128> encodedAsciiSizes = getEncodedAsciiSizes();
+
+// Write `codePoint` as UTF-16 in `\uXXXX` notation: one 6-char escape for
+// values below 0x10000, otherwise a high/low surrogate pair as two 6-char
+// escapes. `out` is advanced past the written chars.
+FOLLY_ALWAYS_INLINE void encodeUtf16Hex(char32_t codePoint, char*& out) {
+  VELOX_DCHECK(codePoint <= 0x10FFFFu);
+
+  // Fits into a single 16-bit code unit.
+  if (codePoint < 0x10000u) {
+    writeHex(static_cast<char16_t>(codePoint), out);
+    return;
+  }
+
+  // Supplementary plane: split the 20-bit offset into two 10-bit halves and
+  // add them to the surrogate bases.
+  const uint32_t offset = codePoint - 0x10000u;
+  const auto highSurrogate =
+      static_cast<char16_t>(0xD800u + ((offset >> 10) & 0x3FFu));
+  const auto lowSurrogate = static_cast<char16_t>(0xDC00u + (offset & 0x3FFu));
+  writeHex(highSurrogate, out);
+  writeHex(lowSurrogate, out);
+}
+
+} // namespace
+
+/// Test-only entry point that forwards to the file-local `encodeUtf16Hex`,
+/// which lives in the anonymous namespace and is therefore not reachable from
+/// other translation units. `out` is advanced by the callee.
+void testingEncodeUtf16Hex(char32_t codePoint, char*& out) {
+  encodeUtf16Hex(codePoint, out);
+}
+
+/// Escape `length` bytes of `input` into `output`.
+/// - 1-byte sequences go through `encodeAscii`.
+/// - 2- and 3-byte sequences are copied unchanged.
+/// - 4-byte sequences are decoded and written as UTF-16 `\uXXXX` escapes, or
+///   as `\uFFFD` when the decoder reports the replacement character.
+/// - Any byte that does not start a recognized sequence becomes `\uFFFD`.
+/// No terminator is written and no bounds check is done on `output`.
+void escapeString(const char* input, size_t length, char* output) {
+  char* out = output;
+
+  const auto* cursor = reinterpret_cast<const unsigned char*>(input);
+  const auto* const limit =
+      reinterpret_cast<const unsigned char*>(input + length);
+
+  while (cursor < limit) {
+    const int count = validateAndGetNextUtf8Length(cursor, limit);
+
+    if (count == 1) {
+      encodeAscii(int8_t(*cursor), out);
+      ++cursor;
+    } else if (count == 2 || count == 3) {
+      // Two- and three-byte sequences are emitted verbatim.
+      memcpy(out, cursor, count);
+      out += count;
+      cursor += count;
+    } else if (count == 4) {
+      // `cursor` is handed to utf8ToCodePoint by reference and is relied on to
+      // be advanced by it; this branch does not move the cursor itself.
+      const char32_t codePoint = folly::utf8ToCodePoint(cursor, limit, true);
+      if (codePoint == U'\ufffd') {
+        writeHex(0xFFFDu, out);
+      } else {
+        encodeUtf16Hex(codePoint, out);
+      }
+    } else {
+      // Unrecognized lead byte or malformed sequence: replace one byte.
+      writeHex(0xFFFDu, out);
+      ++cursor;
+    }
+  }
+}
+
+/// Compute the number of chars `escapeString` writes for the same `input` and
+/// `length`, by walking the input with the same per-sequence decisions.
+/// @param input UTF-8 bytes to measure.
+/// @param length Number of bytes in `input`.
+/// @return Size in chars of the escaped output (no terminator included).
+size_t escapedStringSize(const char* input, size_t length) {
+  // 6 chars that is returned by `writeHex`.
+  constexpr size_t kEncodedHexSize = 6;
+
+  size_t outSize = 0;
+
+  auto* start = reinterpret_cast<const unsigned char*>(input);
+  auto* end = reinterpret_cast<const unsigned char*>(input + length);
+  while (start < end) {
+    int count = validateAndGetNextUtf8Length(start, end);
+    switch (count) {
+      case 1:
+        outSize += encodedAsciiSizes[int8_t(*start)];
+        break;
+      case 2:
+      case 3:
+        outSize += count;
+        break;
+      case 4: {
+        // Decode exactly as `escapeString` does. When the decoder reports the
+        // replacement character, `escapeString` writes a single escape and
+        // resumes wherever the decoder left the cursor, so the remaining bytes
+        // are measured on their own. Assuming 12 chars and 4 consumed bytes
+        // here would under-report the size for such sequences.
+        const unsigned char* next = start;
+        const char32_t codePoint = folly::utf8ToCodePoint(next, end, true);
+        outSize +=
+            codePoint == U'\ufffd' ? kEncodedHexSize : kEncodedHexSize * 2;
+        start = next;
+        continue;
+      }
+      default:
+        // Invalid byte: one `\uFFFD` escape, then skip a single byte.
+        outSize += kEncodedHexSize;
+        count = 1;
+    }
+    start += count;
+  }
+
+  return outSize;
+}
+
+} // namespace facebook::velox
diff --git a/velox/functions/prestosql/json/JsonStringUtil.h b/velox/functions/prestosql/json/JsonStringUtil.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+namespace facebook::velox {
+/// Escape the Unicode characters of `input` to make it canonical for JSON
+/// and legal to print in JSON text. It is assumed that the input is UTF-8
+/// encoded.
+/// It handles the different Unicode planes or code point ranges as follows,
+/// 1. Basic Multilingual Plane [0, 0xFFFF]
+///    a. [0, 0x7F] ASCII. Input is encoded by one UTF-8 byte. Refer to
+///       the `encodeAscii` function for output.
+///    b. [0x80, 0x07FF]. Input is encoded by two UTF-8 bytes. Output the UTF-8
+///       encoding of the code point, which is thus the same bytes as the
+///       input.
+///    c. [0x0800, 0xD7FF] + [0xE000, 0xFFFF]. Input is encoded by three UTF-8
+///       bytes. Output the UTF-8 encoding of the code point, which is thus
+///       the same bytes as the input.
+/// 2. 16 Supplementary Planes [0x10000, 0x10FFFF]
+///    a. [0x10000, 0x10FFFF]. Input is encoded by four UTF-8 bytes. Output
+///       the UTF-16 encoding of the code point, with two UTF-16 code units in
+///       uppercase hexadecimal, each prefixed with '\' and 'u'.
+/// For an illegal code point value or invalid UTF-8 input, emit "\uFFFD".
+/// No surrounding quotes and no terminating character are written.
+/// @param input: Input string to escape that is UTF-8 encoded.
+/// @param length: Length of the input string in bytes.
+/// @param output: Output buffer to write the escaped input to. The caller is
+///                responsible for allocating enough space for the output.
+void escapeString(const char* input, size_t length, char* output);
+
+/// Return the size of the string after the Unicode characters of `input` are
+/// escaped using the same method as in `escapeString`. The function iterates
+/// over `input` once.
+/// @param input: Input string to escape that is UTF-8 encoded.
+/// @param length: Length of the input string in bytes.
+size_t escapedStringSize(const char* input, size_t length);
+
+/// For test only. Encode the `codePoint` value as UTF-16 and write the one or
+/// two prefixed hexadecimals to `out`. Move `out` forward by 6 or 12 chars
+/// accordingly. The caller shall ensure there is enough space in `out`.
+void testingEncodeUtf16Hex(char32_t codePoint, char*& out);
+} // namespace facebook::velox
diff --git a/velox/functions/prestosql/tests/JsonCastTest.cpp b/velox/functions/prestosql/tests/JsonCastTest.cpp
@@ -13,7 +13,9 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#include "folly/Unicode.h"
 
+#include "velox/functions/prestosql/json/JsonStringUtil.h"
 #include "velox/functions/prestosql/tests/CastBaseTest.h"
 #include "velox/functions/prestosql/types/JsonType.h"
 
@@ -245,27 +247,71 @@ TEST_F(JsonCastTest, fromInvalidUtf8) {
   auto invalidString = fromBytes({0xBF});
 
   testCastToJson<StringView>(
-      VARCHAR(), {StringView(invalidString)}, {"\"\\ufffd\""});
+      VARCHAR(), {StringView(invalidString)}, {"\"\\uFFFD\""});
 
   invalidString = fmt::format("head_{}_tail", fromBytes({0xBF}));
   testCastToJson<StringView>(
-      VARCHAR(), {StringView(invalidString)}, {"\"head_\\ufffd_tail\""});
+      VARCHAR(), {StringView(invalidString)}, {"\"head_\\uFFFD_tail\""});
 }
 
 TEST_F(JsonCastTest, fromVarchar) {
-  testCastToJson<StringView>(VARCHAR(), {"\U0001F64F"}, {"\"\\ud83d\\ude4f\""});
-  testCastToJson<StringView>(
-      VARCHAR(),
-      {"aaa"_sv, "bbb"_sv, "ccc"_sv},
-      {R"("aaa")"_sv, R"("bbb")"_sv, R"("ccc")"_sv});
+  // Test casting from ASCII.
+  {
+    std::vector<char> asciiCharacters;
+    for (int c = 32; c < 0x80; c++) {
+      if (c != '\"' && c != '\\') {
+        asciiCharacters.push_back(c);
+      }
+    }
+    std::string asciiString = folly::join("", asciiCharacters);
+    std::string expected = fmt::format("\"{}\"", asciiString);
+    testCastToJson<StringView>(
+        VARCHAR(), {StringView(asciiString)}, {StringView(expected)});
+
+    testCastToJson<StringView>(
+        VARCHAR(),
+        {"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\"\\ ."_sv},
+        {R"("\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007\b\t\n\u000B\f\r\u000E\u000F\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F\"\\ .")"_sv});
+  }
+
+  // Test casting from unicodes in BMP.
+  {
+    std::vector<std::string> charactersInUtf8;
+    for (int i = 0x80; i < 0x10000; i++) {
+      if (folly::utf16_code_unit_is_bmp(char16_t(i))) {
+        charactersInUtf8.push_back(folly::codePointToUtf8(char32_t(i)));
+      }
+    }
+    std::string utf8String = folly::join("", charactersInUtf8);
+    std::string expected = fmt::format("\"{}\"", utf8String);
+    testCastToJson<StringView>(
+        VARCHAR(), {StringView(utf8String)}, {StringView(expected)});
+  }
+
+  // Test casting from unicodes in supplementary planes.
+  {
+    std::vector<std::string> charactersInUtf8;
+    std::vector<std::string> charactersInUtf16;
+    for (int i = 0x10000; i < 0x110000; i++) {
+      charactersInUtf8.push_back(folly::codePointToUtf8(char32_t(i)));
+
+      std::string utf16Hex(12, '\0');
+      char* pos = utf16Hex.data();
+      testingEncodeUtf16Hex(char32_t(i), pos);
+      charactersInUtf16.push_back(utf16Hex);
+    }
+    std::string utf8String = folly::join("", charactersInUtf8);
+    std::string expected =
+        fmt::format("\"{}\"", folly::join("", charactersInUtf16));
+    testCastToJson<StringView>(
+        VARCHAR(), {StringView(utf8String)}, {StringView(expected)});
+  }
+
   testCastToJson<StringView>(
       VARCHAR(),
-      {""_sv,
-       std::nullopt,
-       "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\"\\ ."_sv},
-      {"\"\""_sv,
-       std::nullopt,
-       R"("\u0001\u0002\u0003\u0004\u0005\u0006\u0007\b\t\n\u000b\f\r\u000e\u000f\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001a\u001b\u001c\u001d\u001e\u001f\"\\ .")"_sv});
+      {""_sv, std::nullopt, "\xc0"_sv},
+      {"\"\""_sv, std::nullopt, R"("\uFFFD")"_sv});
+
   testCastToJson<StringView>(
       VARCHAR(),
       {std::nullopt, std::nullopt, std::nullopt, std::nullopt},