Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update Makefile / use ruff formatter #73

Merged
merged 9 commits into from
Feb 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
name: Lint

on:
pull_request:

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
ruff:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- name: Get ruff version from requirements file
id: version
run: |
VERSION=$(grep -m 1 -oP 'ruff==\K(.*)' requirements.txt)
echo "version=$VERSION" >> $GITHUB_OUTPUT

# - uses: chartboost/ruff-action@v1
# with:
# version: ${{ steps.version.outputs.version }}
# args: check --no-fix

- uses: chartboost/ruff-action@v1
with:
version: ${{ steps.version.outputs.version }}
args: format --diff
107 changes: 70 additions & 37 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,62 +1,95 @@
SHELL=/bin/bash
PYTHON=.venv/bin/python
.DEFAULT_GOAL := help

.venv:
@python -m venv .venv
@.venv/bin/pip install -U pip
@.venv/bin/pip install --no-cache-dir -r requirements.txt
PYTHONPATH=
SHELL=/bin/bash
VENV=.venv
VENV_BIN=$(VENV)/bin

clean-tpch-dbgen:
$(MAKE) -C tpch-dbgen clean
.venv: ## Set up Python virtual environment and install requirements
python3 -m venv $(VENV)
$(MAKE) requirements

clean-venv:
rm -r .venv
.PHONY: requirements
requirements: .venv ## Update Python project requirements
$(VENV_BIN)/python -m pip install --upgrade pip
$(VENV_BIN)/pip install --upgrade -r requirements.txt

clean-tables:
rm -r tables_scale_*
.PHONY: fmt
fmt: ## Run autoformatting and linting
$(VENV_BIN)/ruff check
$(VENV_BIN)/ruff format

clean: clean-tpch-dbgen clean-venv
.PHONY: pre-commit
pre-commit: fmt ## Run all code quality checks

tables_scale_1: .venv
.PHONY: tables-scale-1
tables-scale-1: .venv ## Generate data tables
$(MAKE) -C tpch-dbgen all
cd tpch-dbgen && ./dbgen -vf -s 1 && cd ..
mkdir -p "tables_scale_1"
mv tpch-dbgen/*.tbl tables_scale_1/
.venv/bin/python prepare_files.py 1
$(VENV_BIN)/python prepare_files.py 1

tables_scale_10: .venv
.PHONY: tables-scale-10
tables-scale-10: .venv ## Generate bigger data tables
$(MAKE) -C tpch-dbgen all
cd tpch-dbgen && ./dbgen -vf -s 10 && cd ..
mkdir -p "tables_scale_10"
mv tpch-dbgen/*.tbl tables_scale_10/
.venv/bin/python prepare_files.py 10
$(VENV_BIN)/python prepare_files.py 10

.PHONY: run-polars
run-polars: .venv ## Run polars benchmarks
$(VENV_BIN)/python -m polars_queries.executor

.PHONY: run-pandas
run-pandas: .venv ## Run pandas benchmarks
$(VENV_BIN)/python -m pandas_queries.executor

.PHONY: run-pyspark
run-pyspark: .venv ## Run pyspark benchmarks
$(VENV_BIN)/python -m spark_queries.executor

.PHONY: run-dask
run-dask: .venv ## Run dask benchmarks
$(VENV_BIN)/python -m dask_queries.executor

.PHONY: run-duckdb
run-duckdb: .venv ## Run duckdb benchmarks
$(VENV_BIN)/python -m duckdb_queries.executor

.PHONY: run-vaex
run-vaex: .venv ## Run vaex benchmarks
$(VENV_BIN)/python -m vaex_queries.executor

run_polars: .venv
.venv/bin/python -m polars_queries.executor
.PHONY: run-modin
run-modin: .venv ## Run modin benchmarks
$(VENV_BIN)/python -m modin_queries.executor

run_pandas: .venv
.venv/bin/python -m pandas_queries.executor
.PHONY: run-all
run-all: run-polars run-pandas run-pyspark run-dask run-duckdb run-vaex run-modin ## Run all benchmarks

run_dask: .venv
.venv/bin/python -m dask_queries.executor
.PHONY: plot
plot: .venv ## Plot results
$(VENV_BIN)/python -m scripts.plot_results

run_modin: .venv
.venv/bin/python -m modin_queries.executor

run_vaex: .venv
.venv/bin/python -m vaex_queries.executor
.PHONY: clean
clean: clean-tpch-dbgen clean-tables ## Clean up everything
@rm -rf .ruff_cache/
@rm -rf .venv/

run_spark: .venv
.venv/bin/python -m spark_queries.executor
.PHONY: clean-tpch-dbgen
clean-tpch-dbgen: ## Clean up TPC-H folder
@$(MAKE) -C tpch-dbgen clean

run_duckdb: .venv
.venv/bin/python -m duckdb_queries.executor
.PHONY: clean-tables
clean-tables: ## Clean up data tables
@rm -rf tables_scale_*

plot_results: .venv
.venv/bin/python -m scripts.plot_results

run_all: run_polars run_pandas run_vaex run_dask run_modin run_spark
.PHONY: help
help: ## Display this help screen
@echo -e "\033[1mAvailable commands:\033[0m"
@grep -E '^[a-z.A-Z_0-9-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-22s\033[0m %s\n", $$1, $$2}' | sort

pre-commit:
.venv/bin/python -m isort .
.venv/bin/python -m black .
4 changes: 1 addition & 3 deletions polars_queries/q12.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,7 @@ def q():
.then(1)
.otherwise(0)
.alias("high_line_count"),
pl.when(
pl.col("o_orderpriority").is_in(["1-URGENT", "2-HIGH"]).not_()
)
pl.when(pl.col("o_orderpriority").is_in(["1-URGENT", "2-HIGH"]).not_())
.then(1)
.otherwise(0)
.alias("low_line_count"),
Expand Down
14 changes: 6 additions & 8 deletions polars_queries/q21.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,12 @@ def q():
var_1 = "SAUDI ARABIA"

res_1 = (
(
line_item_ds.group_by("l_orderkey")
.agg(pl.col("l_suppkey").n_unique().alias("nunique_col"))
.filter(pl.col("nunique_col") > 1)
.join(
line_item_ds.filter(pl.col("l_receiptdate") > pl.col("l_commitdate")),
on="l_orderkey",
)
line_item_ds.group_by("l_orderkey")
.agg(pl.col("l_suppkey").n_unique().alias("nunique_col"))
.filter(pl.col("nunique_col") > 1)
.join(
line_item_ds.filter(pl.col("l_receiptdate") > pl.col("l_commitdate")),
on="l_orderkey",
)
).cache()

Expand Down
32 changes: 8 additions & 24 deletions prepare_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,11 @@
h_nation = """n_nationkey
n_name
n_regionkey
n_comment""".split(
"\n"
)
n_comment""".split("\n")

h_region = """r_regionkey
r_name
r_comment""".split(
"\n"
)
r_comment""".split("\n")

h_part = """p_partkey
p_name
Expand All @@ -25,27 +21,21 @@
p_size
p_container
p_retailprice
p_comment""".split(
"\n"
)
p_comment""".split("\n")

h_supplier = """s_suppkey
s_name
s_address
s_nationkey
s_phone
s_acctbal
s_comment""".split(
"\n"
)
s_comment""".split("\n")

h_partsupp = """ps_partkey
ps_suppkey
ps_availqty
ps_supplycost
ps_comment""".split(
"\n"
)
ps_comment""".split("\n")

h_customer = """c_custkey
c_name
Expand All @@ -54,9 +44,7 @@
c_phone
c_acctbal
c_mktsegment
c_comment""".split(
"\n"
)
c_comment""".split("\n")

h_orders = """o_orderkey
o_custkey
Expand All @@ -66,9 +54,7 @@
o_orderpriority
o_clerk
o_shippriority
o_comment""".split(
"\n"
)
o_comment""".split("\n")

h_lineitem = """l_orderkey
l_partkey
Expand All @@ -85,9 +71,7 @@
l_receiptdate
l_shipinstruct
l_shipmode
comments""".split(
"\n"
)
comments""".split("\n")

for name in [
"nation",
Expand Down
32 changes: 8 additions & 24 deletions prepare_large_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,12 @@
n_nationkey
n_name
n_regionkey
n_comment""".split(
"\n"
)
n_comment""".split("\n")

h_region = """
r_regionkey
r_name
r_comment""".split(
"\n"
)
r_comment""".split("\n")

h_part = """
p_partkey
Expand All @@ -28,9 +24,7 @@
p_size
p_container
p_retailprice
p_comment""".split(
"\n"
)
p_comment""".split("\n")

h_supplier = """
s_suppkey
Expand All @@ -39,18 +33,14 @@
s_nationkey
s_phone
s_acctbal
s_comment""".split(
"\n"
)
s_comment""".split("\n")

h_partsupp = """
ps_partkey
ps_suppkey
ps_availqty
ps_supplycost
ps_comment""".split(
"\n"
)
ps_comment""".split("\n")

h_customer = """
c_custkey
Expand All @@ -60,9 +50,7 @@
c_phone
c_acctbal
c_mktsegment
c_comment""".split(
"\n"
)
c_comment""".split("\n")

h_orders = """
o_orderkey
Expand All @@ -73,9 +61,7 @@
o_orderpriority
o_clerk
o_shippriority
o_comment""".split(
"\n"
)
o_comment""".split("\n")

h_lineitem = """
l_orderkey
Expand All @@ -93,9 +79,7 @@
l_receiptdate
l_shipinstruct
l_shipmode
comments""".split(
"\n"
)
comments""".split("\n")

for name in [
"nation",
Expand Down
Loading