Skip to content

Commit

Permalink
Change unicode escaping in JSON (#10887)
Browse files Browse the repository at this point in the history
Summary:
Change how unicode is escaped in JSON Text. The objective is
to make it consistent with Presto and make a canonical JSON representation.
The implementation is consistent with Java jackson 2.11.0 Utf8Generator

Pull Request resolved: #10887

Reviewed By: Yuhta

Differential Revision: D62591195

Pulled By: gggrace14

fbshipit-source-id: 25235f97e371103197e522d5ffb5c090a7e30888
  • Loading branch information
gggrace14 authored and facebook-github-bot committed Sep 21, 2024
1 parent 1e736ba commit 00e65bc
Show file tree
Hide file tree
Showing 7 changed files with 335 additions and 25 deletions.
1 change: 0 additions & 1 deletion velox/functions/lib/RegistrationHelpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
#pragma once

#include "velox/functions/Registerer.h"
#include "velox/functions/prestosql/types/TimestampWithTimeZoneType.h"

namespace facebook::velox::functions {
namespace {
Expand Down
31 changes: 31 additions & 0 deletions velox/functions/lib/Utf8Utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@

#include <cstdint>

#include "folly/CPortability.h"

#include "velox/common/base/Exceptions.h"

namespace facebook::velox::functions {

/// This function is not part of the original utf8proc.
Expand Down Expand Up @@ -48,4 +52,31 @@ namespace facebook::velox::functions {
/// https://github.com/airlift/slice/blob/master/src/main/java/io/airlift/slice/SliceUtf8.java
int32_t tryGetCharLength(const char* input, int64_t size);

/// Returns the length in bytes of the UTF-8 encoded character that starts at
/// `string`, or -1 if the bytes at `string` do not form a structurally valid
/// UTF-8 sequence that fits before `end`. Only the lead-byte pattern and the
/// continuation-byte markers are inspected.
/// @param string Pointer to the first byte to examine. Must be before `end`.
/// @param end Pointer one past the last readable byte.
FOLLY_ALWAYS_INLINE int validateAndGetNextUtf8Length(
    const unsigned char* string,
    const unsigned char* end) {
  VELOX_DCHECK(string < end, "Expect non-empty string.");

  const unsigned char lead = *string;

  // 0xxxxxxx: single-byte (ASCII) character.
  if ((lead & 0x80u) == 0) {
    return 1;
  }

  // Derive the expected sequence length from the lead-byte pattern.
  int expected = -1;
  if ((lead & 0xE0u) == 0xC0u) {
    expected = 2;
  } else if ((lead & 0xF0u) == 0xE0u) {
    expected = 3;
  } else if ((lead & 0xF8u) == 0xF0u) {
    expected = 4;
  } else {
    return -1;
  }

  // The whole sequence must fit into [string, end).
  if (end - string < expected) {
    return -1;
  }

  // Every trailing byte must look like 10xxxxxx.
  for (int i = 1; i < expected; ++i) {
    if ((string[i] & 0xC0u) != 0x80u) {
      return -1;
    }
  }
  return expected;
}

} // namespace facebook::velox::functions
1 change: 1 addition & 0 deletions velox/functions/prestosql/json/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ velox_add_library(
velox_functions_json
JsonExtractor.cpp
JsonPathTokenizer.cpp
JsonStringUtil.cpp
SIMDJsonExtractor.cpp
SIMDJsonUtil.cpp)

Expand Down
184 changes: 184 additions & 0 deletions velox/functions/prestosql/json/JsonStringUtil.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <array>

#include "folly/Unicode.h"

#include "velox/common/base/Exceptions.h"
#include "velox/functions/lib/Utf8Utils.h"
#include "velox/functions/prestosql/json/JsonStringUtil.h"

using namespace facebook::velox::functions;

namespace facebook::velox {
namespace {

// Maps a nibble in [0, 15] to its uppercase hexadecimal character.
FOLLY_ALWAYS_INLINE char hexDigit(uint8_t c) {
  VELOX_DCHECK_LT(c, 16);
  if (c < 10) {
    return static_cast<char>(c + '0');
  }
  return static_cast<char>(c - 10 + 'A');
}

// Writes `value` to `out` as the 6-character sequence "\uXXXX", where XXXX is
// the value in uppercase hexadecimal, most significant nibble first. Advances
// `out` by 6. The caller must ensure at least 6 writable chars at `out`.
FOLLY_ALWAYS_INLINE void writeHex(char16_t value, char*& out) {
  // No byte-order conversion here: the shifts below operate on the numeric
  // value, not on its in-memory representation, so they already yield the
  // nibbles in most-significant-first order on every host. Converting with
  // folly::Endian::little() would swap the bytes on big-endian hosts and emit
  // the wrong digits.
  *out++ = '\\';
  *out++ = 'u';
  *out++ = hexDigit((value >> 12) & 0x0F);
  *out++ = hexDigit((value >> 8) & 0x0F);
  *out++ = hexDigit((value >> 4) & 0x0F);
  *out++ = hexDigit(value & 0x0F);
}

// Builds the escape table for the 128 ASCII characters. For each character the
// entry is:
//   0   - the character is written as is;
//   > 0 - the character is written as a backslash followed by the entry;
//   -1  - the character is written as a "\uXXXX" hexadecimal escape.
std::array<int8_t, 128> getAsciiEscapes() {
  // Value-initialize so that every character not explicitly listed below is
  // 0 ("no escaping"). A default-initialized array would leave those entries
  // indeterminate.
  std::array<int8_t, 128> escapes{};
  // Control characters default to the hexadecimal escape; the ones with a
  // short form are overridden below.
  std::fill(escapes.begin(), escapes.begin() + 32, static_cast<int8_t>(-1));
  escapes['"'] = '"';
  escapes['\\'] = '\\';
  escapes['\b'] = 'b';
  escapes['\t'] = 't';
  escapes['\n'] = 'n';
  escapes['\f'] = 'f';
  escapes['\r'] = 'r';
  return escapes;
}
static const std::array<int8_t, 128> asciiEscapes = getAsciiEscapes();

// Writes the JSON representation of the ASCII character `value` to `out` and
// advances `out` past the written chars. Depending on the `asciiEscapes`
// entry, the character is written as is, as a two-char backslash escape, or
// as a "\uXXXX" hexadecimal escape.
FOLLY_ALWAYS_INLINE void encodeAscii(int8_t value, char*& out) {
  const int8_t escape = asciiEscapes[value];
  if (escape < 0) {
    // No short form: fall back to the hexadecimal escape.
    writeHex(value, out);
    return;
  }
  if (escape > 0) {
    // Short two-char escape such as \n or \".
    *out++ = '\\';
    *out++ = char(escape);
    return;
  }
  // No escaping needed.
  *out++ = char(value);
}

// Builds a table with the number of output chars each ASCII character takes
// once escaped: 1 when written as is, 2 for a backslash escape, and 6 for a
// "\uXXXX" hexadecimal escape. Derived from `asciiEscapes`.
std::array<int8_t, 128> getEncodedAsciiSizes() {
  std::array<int8_t, 128> sizes;
  int index = 0;
  for (int8_t& size : sizes) {
    const int8_t escape = asciiEscapes[index++];
    size = (escape == 0) ? 1 : (escape > 0 ? 2 : 6);
  }
  return sizes;
}
static const std::array<int8_t, 128> encodedAsciiSizes = getEncodedAsciiSizes();

// Encodes `codePoint` as one or two UTF-16 code units and writes each unit to
// `out` as a 6-char "\uXXXX" escape, advancing `out` by 6 or 12 chars.
FOLLY_ALWAYS_INLINE void encodeUtf16Hex(char32_t codePoint, char*& out) {
  VELOX_DCHECK(codePoint <= 0x10FFFFu);

  // Code points below 0x10000 fit into a single 16-bit code unit.
  if (codePoint < 0x10000u) {
    writeHex(static_cast<char16_t>(codePoint), out);
    return;
  }

  // Otherwise split the 20-bit offset into a high and a low surrogate.
  const char32_t offset = codePoint - 0x10000u;
  const auto highSurrogate =
      static_cast<char16_t>(0xD800u + ((offset >> 10) & 0x3FFu));
  const auto lowSurrogate = static_cast<char16_t>(0xDC00u + (offset & 0x3FFu));
  writeHex(highSurrogate, out);
  writeHex(lowSurrogate, out);
}

} // namespace

// Test-only entry point that forwards to the file-local encodeUtf16Hex():
// writes `codePoint` to `out` as one or two "\uXXXX" escapes and advances
// `out` past the written chars.
void testingEncodeUtf16Hex(char32_t codePoint, char*& out) {
encodeUtf16Hex(codePoint, out);
}

// Escapes the UTF-8 bytes in [input, input + length) and writes the result to
// `output`. ASCII goes through encodeAscii(), 2- and 3-byte sequences are
// copied unchanged, 4-byte sequences are written as UTF-16 "\uXXXX" escapes,
// and bytes that do not start a valid sequence are written as "\uFFFD". No
// terminating null is written.
void escapeString(const char* input, size_t length, char* output) {
  char* out = output;

  const auto* cursor = reinterpret_cast<const unsigned char*>(input);
  const auto* const limit =
      reinterpret_cast<const unsigned char*>(input + length);

  while (cursor < limit) {
    const int charLength = validateAndGetNextUtf8Length(cursor, limit);

    if (charLength == 1) {
      encodeAscii(int8_t(*cursor), out);
      ++cursor;
    } else if (charLength == 2 || charLength == 3) {
      // Copied verbatim.
      memcpy(out, reinterpret_cast<const char*>(cursor), charLength);
      out += charLength;
      cursor += charLength;
    } else if (charLength == 4) {
      // utf8ToCodePoint() advances `cursor` itself.
      const char32_t codePoint = folly::utf8ToCodePoint(cursor, limit, true);
      if (codePoint == U'\ufffd') {
        writeHex(0xFFFDu, out);
      } else {
        encodeUtf16Hex(codePoint, out);
      }
    } else {
      // Invalid lead byte or truncated sequence: replace one byte and resume
      // at the next one.
      writeHex(0xFFFDu, out);
      ++cursor;
    }
  }
}

size_t escapedStringSize(const char* input, size_t length) {
// 6 chars that is returned by `writeHex`.
constexpr size_t kEncodedHexSize = 6;

size_t outSize = 0;

auto* start = reinterpret_cast<const unsigned char*>(input);
auto* end = reinterpret_cast<const unsigned char*>(input + length);
while (start < end) {
int count = validateAndGetNextUtf8Length(start, end);
switch (count) {
case 1:
outSize += encodedAsciiSizes[int8_t(*start)];
break;
case 2:
case 3:
outSize += count;
break;
case 4:
outSize += kEncodedHexSize * 2;
break;
default:
outSize += kEncodedHexSize;
count = 1;
}
start += count;
}

return outSize;
}

} // namespace facebook::velox
54 changes: 54 additions & 0 deletions velox/functions/prestosql/json/JsonStringUtil.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

namespace facebook::velox {
/// Escape the unicode characters of `input` to make it canonical for JSON
/// and legal to print in JSON text. It is assumed that the input is UTF-8
/// encoded.
/// It handles the different unicode planes or code point ranges as follows:
/// 1. Basic Multilingual Plane [0, 0xFFFF]
///    a. [0, 0x7F] ASCII. Input is encoded by one UTF-8 byte. Refer to
///       the `encodeAscii` function for output.
///    b. [0x80, 0x07FF]. Input is encoded by two UTF-8 bytes. Output the UTF-8
///       encoding of the code point, i.e. the same bytes as the input.
///    c. [0x0800, 0xD7FF] + [0xE000, 0xFFFF]. Input is encoded by three UTF-8
///       bytes. Output the UTF-8 encoding of the code point, i.e. the same
///       bytes as the input.
/// 2. 16 Supplementary Planes [0x10000, 0x10FFFF]
///    a. [0x10000, 0x10FFFF]. Input is encoded by four UTF-8 bytes. Output
///       the UTF-16 encoding of the code point, with two UTF-16 code units in
///       uppercase hexadecimal, each prefixed with '\' and 'u'.
/// For an illegal code point value or invalid UTF-8 input, "\uFFFD" is written
/// in place of the offending bytes.
/// No terminating null character is written to `output`.
/// @param input: Input string to escape that is UTF-8 encoded.
/// @param length: Length of the input string in bytes.
/// @param output: Output buffer to write the escaped input to. The caller is
/// responsible for allocating enough space for the output.
void escapeString(const char* input, size_t length, char* output);

/// Return the size of the string after the unicode characters of `input` are
/// escaped using the same method as in `escapeString`. The function iterates
/// over `input` once.
/// @param input: Input string to escape that is UTF-8 encoded.
/// @param length: Length of the input string in bytes.
size_t escapedStringSize(const char* input, size_t length);

/// For test only. Encode `codePoint` value by UTF-16 and write the one or two
/// prefixed hexadecimals to `out`. Move `out` forward by 6 or 12 chars
/// accordingly. The caller shall ensure there is enough space in `out`.
void testingEncodeUtf16Hex(char32_t codePoint, char*& out);
} // namespace facebook::velox
72 changes: 59 additions & 13 deletions velox/functions/prestosql/tests/JsonCastTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "folly/Unicode.h"

#include "velox/functions/prestosql/json/JsonStringUtil.h"
#include "velox/functions/prestosql/tests/CastBaseTest.h"
#include "velox/functions/prestosql/types/JsonType.h"

Expand Down Expand Up @@ -245,27 +247,71 @@ TEST_F(JsonCastTest, fromInvalidUtf8) {
auto invalidString = fromBytes({0xBF});

testCastToJson<StringView>(
VARCHAR(), {StringView(invalidString)}, {"\"\\ufffd\""});
VARCHAR(), {StringView(invalidString)}, {"\"\\uFFFD\""});

invalidString = fmt::format("head_{}_tail", fromBytes({0xBF}));
testCastToJson<StringView>(
VARCHAR(), {StringView(invalidString)}, {"\"head_\\ufffd_tail\""});
VARCHAR(), {StringView(invalidString)}, {"\"head_\\uFFFD_tail\""});
}

TEST_F(JsonCastTest, fromVarchar) {
testCastToJson<StringView>(VARCHAR(), {"\U0001F64F"}, {"\"\\ud83d\\ude4f\""});
testCastToJson<StringView>(
VARCHAR(),
{"aaa"_sv, "bbb"_sv, "ccc"_sv},
{R"("aaa")"_sv, R"("bbb")"_sv, R"("ccc")"_sv});
// Test casting from ASCII.
{
std::vector<char> asciiCharacters;
for (int c = 32; c < 0x80; c++) {
if (c != '\"' && c != '\\') {
asciiCharacters.push_back(c);
}
}
std::string asciiString = folly::join("", asciiCharacters);
std::string expected = fmt::format("\"{}\"", asciiString);
testCastToJson<StringView>(
VARCHAR(), {StringView(asciiString)}, {StringView(expected)});

testCastToJson<StringView>(
VARCHAR(),
{"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\"\\ ."_sv},
{R"("\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007\b\t\n\u000B\f\r\u000E\u000F\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F\"\\ .")"_sv});
}

// Test casting from unicodes in BMP.
{
std::vector<std::string> charactersInUtf8;
for (int i = 0x80; i < 0x10000; i++) {
if (folly::utf16_code_unit_is_bmp(char16_t(i))) {
charactersInUtf8.push_back(folly::codePointToUtf8(char32_t(i)));
}
}
std::string utf8String = folly::join("", charactersInUtf8);
std::string expected = fmt::format("\"{}\"", utf8String);
testCastToJson<StringView>(
VARCHAR(), {StringView(utf8String)}, {StringView(expected)});
}

// Test casting from unicodes in supplementary planes.
{
std::vector<std::string> charactersInUtf8;
std::vector<std::string> charactersInUtf16;
for (int i = 0x10000; i < 0x110000; i++) {
charactersInUtf8.push_back(folly::codePointToUtf8(char32_t(i)));

std::string utf16Hex(12, '\0');
char* pos = utf16Hex.data();
testingEncodeUtf16Hex(char32_t(i), pos);
charactersInUtf16.push_back(utf16Hex);
}
std::string utf8String = folly::join("", charactersInUtf8);
std::string expected =
fmt::format("\"{}\"", folly::join("", charactersInUtf16));
testCastToJson<StringView>(
VARCHAR(), {StringView(utf8String)}, {StringView(expected)});
}

testCastToJson<StringView>(
VARCHAR(),
{""_sv,
std::nullopt,
"\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\"\\ ."_sv},
{"\"\""_sv,
std::nullopt,
R"("\u0001\u0002\u0003\u0004\u0005\u0006\u0007\b\t\n\u000b\f\r\u000e\u000f\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001a\u001b\u001c\u001d\u001e\u001f\"\\ .")"_sv});
{""_sv, std::nullopt, "\xc0"_sv},
{"\"\""_sv, std::nullopt, R"("\uFFFD")"_sv});

testCastToJson<StringView>(
VARCHAR(),
{std::nullopt, std::nullopt, std::nullopt, std::nullopt},
Expand Down
Loading

0 comments on commit 00e65bc

Please sign in to comment.