Skip to content

Commit

Permalink
Avoid generating unsupported types for JoinFuzzer with PQR (#10662)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: #10662

When running JoinFuzzerTest with PrestoQueryRunner, currently only about
16-20% of iterations are verified against Presto. The remaining iterations
are unverified due to unsupported data types. This diff makes JoinFuzzer
avoid generating unsupported types when running with PrestoQueryRunner.
After this change, over 85% of iterations are verified against Presto.

Differential Revision: D60768414
  • Loading branch information
kagamiori authored and facebook-github-bot committed Aug 6, 2024
1 parent abbeef5 commit 8f99711
Show file tree
Hide file tree
Showing 7 changed files with 103 additions and 12 deletions.
1 change: 1 addition & 0 deletions velox/exec/fuzzer/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ add_library(velox_fuzzer_util DuckQueryRunner.cpp PrestoQueryRunner.cpp

target_link_libraries(
velox_fuzzer_util
velox_vector_fuzzer
velox_core
velox_exec_test_lib
cpr::cpr
Expand Down
24 changes: 24 additions & 0 deletions velox/exec/fuzzer/DuckQueryRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,30 @@ class DuckQueryRunner : public ReferenceQueryRunner {
public:
DuckQueryRunner();

/// Identifies this runner as the DuckDB-backed reference query runner.
RunnerType runnerType() const override {
return RunnerType::kDuckQueryRunner;
}

/// Skip queries that use Timestamp, Varbinary, Unknown, and IntervalDayTime
/// types. DuckDB doesn't support nanosecond precision for timestamps or
/// casting from Bigint to Interval.
///
/// TODO Investigate mismatches reported when comparing Varbinary.
std::vector<TypePtr> supportedScalarTypes() const override {
static std::vector<TypePtr> kScalarTypes{
BOOLEAN(),
TINYINT(),
SMALLINT(),
INTEGER(),
BIGINT(),
REAL(),
DOUBLE(),
VARCHAR(),
DATE(),
};
return kScalarTypes;
}

/// Specify names of aggregate functions to exclude from the list of supported
/// functions. Used to exclude functions that are non-deterministic, have bugs
/// or whose semantics differ from Velox.
Expand Down
53 changes: 44 additions & 9 deletions velox/exec/fuzzer/JoinFuzzer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,10 @@ namespace facebook::velox::exec::test {

namespace {

/// Formats 'value' as a count followed by its share of 'total' expressed as a
/// percentage with two decimal places, e.g. "3 (25.00%)".
///
/// @param value The count to report.
/// @param total The denominator the percentage is computed against.
/// @return A string of the form "<value> (<percentage>%)". When 'total' is
/// zero the percentage is reported as 0.00 instead of dividing by zero.
std::string makePercentageString(size_t value, size_t total) {
  // Guard against an empty run (e.g. zero iterations): a floating-point
  // division by zero would otherwise produce NaN/inf and print "nan%".
  const double percentage =
      total == 0 ? 0.0 : static_cast<double>(value) / total * 100;
  return fmt::format("{} ({:.2f}%)", value, percentage);
}

class JoinFuzzer {
public:
JoinFuzzer(
Expand Down Expand Up @@ -300,6 +304,24 @@ class JoinFuzzer {

VectorFuzzer vectorFuzzer_;
std::unique_ptr<ReferenceQueryRunner> referenceQueryRunner_;

/// Counters accumulated across fuzzer iterations and reported by print().
struct Stats {
  /// Count of iterations whose results were verified against the reference
  /// DB.
  size_t numVerified{0};

  /// Count of iterations that test cross product.
  size_t numCrossProduct{0};

  /// Logs the total iteration count, followed by each counter rendered as a
  /// count plus its percentage of 'totalIterations'.
  void print(size_t totalIterations) const {
    const auto verifiedSummary =
        makePercentageString(numVerified, totalIterations);
    const auto crossProductSummary =
        makePercentageString(numCrossProduct, totalIterations);

    LOG(INFO) << "Total iterations tested: " << totalIterations;
    LOG(INFO) << "Total iterations verified against reference DB: "
              << verifiedSummary;
    LOG(INFO) << "Total iterations testing cross product: "
              << crossProductSummary;
  }
};

Stats stats_;
};

JoinFuzzer::JoinFuzzer(
Expand Down Expand Up @@ -351,7 +373,8 @@ std::vector<TypePtr> JoinFuzzer::generateJoinKeyTypes(int32_t numKeys) {
types.reserve(numKeys);
for (auto i = 0; i < numKeys; ++i) {
// Pick random scalar type.
types.push_back(vectorFuzzer_.randType(0 /*maxDepth*/));
types.push_back(vectorFuzzer_.randType(
referenceQueryRunner_->supportedScalarTypes(), 0 /*maxDepth*/));
}
return types;
}
Expand All @@ -374,7 +397,8 @@ std::vector<RowVectorPtr> JoinFuzzer::generateProbeInput(
const auto numPayload = randInt(0, 3);
for (auto i = 0; i < numPayload; ++i) {
names.push_back(fmt::format("tp{}", i + keyNames.size()));
types.push_back(vectorFuzzer_.randType(2 /*maxDepth*/));
types.push_back(vectorFuzzer_.randType(
referenceQueryRunner_->supportedScalarTypes(), 2 /*maxDepth*/));
}

const auto inputType = ROW(std::move(names), std::move(types));
Expand Down Expand Up @@ -407,7 +431,8 @@ std::vector<RowVectorPtr> JoinFuzzer::generateBuildInput(
const auto numPayload = randInt(0, 3);
for (auto i = 0; i < numPayload; ++i) {
names.push_back(fmt::format("bp{}", i + buildKeys.size()));
types.push_back(vectorFuzzer_.randType(2 /*maxDepth*/));
types.push_back(vectorFuzzer_.randType(
referenceQueryRunner_->supportedScalarTypes(), 2 /*maxDepth*/));
}

const auto rowType = ROW(std::move(names), std::move(types));
Expand Down Expand Up @@ -619,12 +644,13 @@ std::optional<MaterializedRowMultiset> JoinFuzzer::computeReferenceResults(
const core::PlanNodePtr& plan,
const std::vector<RowVectorPtr>& probeInput,
const std::vector<RowVectorPtr>& buildInput) {
if (containsUnsupportedTypes(probeInput[0]->type())) {
return std::nullopt;
}

if (containsUnsupportedTypes(buildInput[0]->type())) {
return std::nullopt;
if (referenceQueryRunner_->runnerType() == RunnerType::kDuckQueryRunner) {
if (containsUnsupportedTypes(probeInput[0]->type())) {
return std::nullopt;
}
if (containsUnsupportedTypes(buildInput[0]->type())) {
return std::nullopt;
}
}

if (auto sql = referenceQueryRunner_->toSql(plan)) {
Expand Down Expand Up @@ -935,6 +961,9 @@ RowVectorPtr JoinFuzzer::testCrossProduct(
assertEqualResults(
referenceResult.value(), plan.plan->outputType(), {expected}),
"Velox and DuckDB results don't match");

LOG(INFO) << "Result matches with referenc DB.";
stats_.numVerified++;
}
}

Expand Down Expand Up @@ -1008,6 +1037,8 @@ void JoinFuzzer::verify(core::JoinType joinType) {
core::isFullJoin(joinType)) &&
FLAGS_batch_size * FLAGS_num_batches <= 500) {
if (vectorFuzzer_.coinToss(0.1)) {
stats_.numCrossProduct++;

auto result = testCrossProduct(
tableScanDir->getPath(),
joinType,
Expand Down Expand Up @@ -1070,6 +1101,9 @@ void JoinFuzzer::verify(core::JoinType joinType) {
defaultPlan.plan->outputType(),
{expected}),
"Velox and Reference results don't match");

LOG(INFO) << "Result matches with referenc DB.";
stats_.numVerified++;
}
}

Expand Down Expand Up @@ -1453,6 +1487,7 @@ void JoinFuzzer::go() {
reSeed();
++iteration;
}
stats_.print(iteration);
}

} // namespace
Expand Down
20 changes: 20 additions & 0 deletions velox/exec/fuzzer/PrestoQueryRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,26 @@ class PrestoQueryRunner : public velox::exec::test::ReferenceQueryRunner {
std::string user,
std::chrono::milliseconds timeout);

/// Identifies this runner as the Presto-backed reference query runner.
RunnerType runnerType() const override {
return RunnerType::kPrestoQueryRunner;
}

std::vector<TypePtr> supportedScalarTypes() const override {
static std::vector<TypePtr> kScalarTypes{
BOOLEAN(),
TINYINT(),
SMALLINT(),
INTEGER(),
BIGINT(),
REAL(),
DOUBLE(),
VARCHAR(),
VARBINARY(),
TIMESTAMP(),
};
return kScalarTypes;
}

/// Converts Velox query plan to Presto SQL. Supports Values -> Aggregation or
/// Window with an optional Project on top.
///
Expand Down
11 changes: 11 additions & 0 deletions velox/exec/fuzzer/ReferenceQueryRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,25 @@
#pragma once

#include "velox/core/PlanNode.h"
#include "velox/vector/fuzzer/VectorFuzzer.h"

namespace facebook::velox::exec::test {

/// Identifies which reference database a ReferenceQueryRunner is backed by.
enum RunnerType {
  /// Runner backed by Presto.
  kPrestoQueryRunner,
  /// Runner backed by DuckDB.
  kDuckQueryRunner,
};

/// Query runner that uses reference database, i.e. DuckDB, Presto, Spark.
class ReferenceQueryRunner {
public:
virtual ~ReferenceQueryRunner() = default;

virtual RunnerType runnerType() const = 0;

/// Returns the scalar types supported by the reference database. Fuzzers use
/// this list to restrict the candidates when generating random types. The
/// default implementation returns defaultScalarTypes(); subclasses may
/// override it to narrow the set.
virtual std::vector<TypePtr> supportedScalarTypes() const {
return defaultScalarTypes();
}

/// Converts Velox plan into SQL accepted by the reference database.
/// @return std::nullopt if the plan uses features not supported by the
/// reference database.
Expand Down
3 changes: 0 additions & 3 deletions velox/vector/fuzzer/VectorFuzzer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1025,8 +1025,6 @@ VectorPtr VectorLoaderWrap::makeEncodingPreservedCopy(
std::move(nulls), std::move(indices), vectorSize, baseResult);
}

namespace {

const std::vector<TypePtr> defaultScalarTypes() {
// @TODO Add decimal TypeKinds to randType.
// Refer https://github.com/facebookincubator/velox/issues/3942
Expand All @@ -1046,7 +1044,6 @@ const std::vector<TypePtr> defaultScalarTypes() {
};
return kScalarTypes;
}
} // namespace

TypePtr randType(FuzzerGenerator& rng, int maxDepth) {
return randType(rng, defaultScalarTypes(), maxDepth);
Expand Down
3 changes: 3 additions & 0 deletions velox/vector/fuzzer/VectorFuzzer.h
Original file line number Diff line number Diff line change
Expand Up @@ -389,4 +389,7 @@ RowTypePtr randRowType(
const std::vector<TypePtr>& scalarTypes,
int maxDepth = 5);

// Returns the default set of scalar types to choose from when generating
// random types. Returned by value.
const std::vector<TypePtr> defaultScalarTypes();

} // namespace facebook::velox

0 comments on commit 8f99711

Please sign in to comment.