pola-rs · stinodego · Feb 21, 2024 · Feb 20, 2024 · Feb 20, 2024 · Feb 20, 2024
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -0,0 +1,30 @@
+name: Lint
+
+on:
+  pull_request:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  ruff:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Get ruff version from requirements file
+        id: version  # exposes steps.version.outputs.version to the ruff-action step below
+        run: |
+          VERSION=$(grep -m 1 -oP 'ruff==\K(.*)' requirements.txt)
+          echo "version=$VERSION" >> "$GITHUB_OUTPUT"
+
+      # - uses: chartboost/ruff-action@v1
+      #   with:
+      #     version: ${{ steps.version.outputs.version }}
+      #     args: check --no-fix
+
+      - uses: chartboost/ruff-action@v1
+        with:
+          version: ${{ steps.version.outputs.version }}
+          args: format --diff
diff --git a/Makefile b/Makefile
@@ -1,62 +1,95 @@
-SHELL=/bin/bash
-PYTHON=.venv/bin/python
+.DEFAULT_GOAL := help
 
-.venv:
-	@python -m venv .venv
-	@.venv/bin/pip install -U pip
-	@.venv/bin/pip install --no-cache-dir -r requirements.txt
+PYTHONPATH=
+SHELL=/bin/bash
+VENV=.venv
+VENV_BIN=$(VENV)/bin
 
-clean-tpch-dbgen:
-	$(MAKE) -C tpch-dbgen clean
+.venv:  ## Set up Python virtual environment and install requirements
+	python3 -m venv $(VENV)
+	$(MAKE) requirements
 
-clean-venv:
-	rm -r .venv
+.PHONY: requirements
+requirements: .venv  ## Update Python project requirements
+	$(VENV_BIN)/python -m pip install --upgrade pip
+	$(VENV_BIN)/pip install --upgrade -r requirements.txt
 
-clean-tables:
-	rm -r tables_scale_*
+.PHONY: fmt
+fmt: .venv  ## Run autoformatting and linting
+	$(VENV_BIN)/ruff check
+	$(VENV_BIN)/ruff format
 
-clean: clean-tpch-dbgen clean-venv
+.PHONY: pre-commit
+pre-commit: fmt  ## Run all code quality checks
 
-tables_scale_1: .venv
+.PHONY: tables-scale-1
+tables-scale-1: .venv  ## Generate data tables
 	$(MAKE) -C tpch-dbgen all
 	cd tpch-dbgen && ./dbgen -vf -s 1 && cd ..
 	mkdir -p "tables_scale_1"
 	mv tpch-dbgen/*.tbl tables_scale_1/
-	.venv/bin/python prepare_files.py 1
+	$(VENV_BIN)/python prepare_files.py 1
 
-tables_scale_10: .venv
+.PHONY: tables-scale-10
+tables-scale-10: .venv  ## Generate bigger data tables
 	$(MAKE) -C tpch-dbgen all
 	cd tpch-dbgen && ./dbgen -vf -s 10 && cd ..
 	mkdir -p "tables_scale_10"
 	mv tpch-dbgen/*.tbl tables_scale_10/
-	.venv/bin/python prepare_files.py 10
+	$(VENV_BIN)/python prepare_files.py 10
+
+.PHONY: run-polars
+run-polars: .venv  ## Run polars benchmarks
+	$(VENV_BIN)/python -m polars_queries.executor
+
+.PHONY: run-pandas
+run-pandas: .venv  ## Run pandas benchmarks
+	$(VENV_BIN)/python -m pandas_queries.executor
+
+.PHONY: run-pyspark
+run-pyspark: .venv  ## Run pyspark benchmarks
+	$(VENV_BIN)/python -m spark_queries.executor
+
+.PHONY: run-dask
+run-dask: .venv  ## Run dask benchmarks
+	$(VENV_BIN)/python -m dask_queries.executor
+
+.PHONY: run-duckdb
+run-duckdb: .venv  ## Run duckdb benchmarks
+	$(VENV_BIN)/python -m duckdb_queries.executor
+
+.PHONY: run-vaex
+run-vaex: .venv  ## Run vaex benchmarks
+	$(VENV_BIN)/python -m vaex_queries.executor
 
-run_polars: .venv
-	.venv/bin/python -m polars_queries.executor
+.PHONY: run-modin
+run-modin: .venv  ## Run modin benchmarks
+	$(VENV_BIN)/python -m modin_queries.executor
 
-run_pandas: .venv
-	.venv/bin/python -m pandas_queries.executor
+.PHONY: run-all
+run-all: run-polars run-pandas run-pyspark run-dask run-duckdb run-vaex run-modin   ## Run all benchmarks
 
-run_dask: .venv
-	.venv/bin/python -m dask_queries.executor
+.PHONY: plot
+plot: .venv  ## Plot results
+	$(VENV_BIN)/python -m scripts.plot_results
 
-run_modin: .venv
-	.venv/bin/python -m modin_queries.executor
 
-run_vaex: .venv
-	.venv/bin/python -m vaex_queries.executor
+.PHONY: clean
+clean:  clean-tpch-dbgen clean-tables  ## Clean up everything
+	@rm -rf .ruff_cache/
+	@rm -rf .venv/
 
-run_spark: .venv
-	.venv/bin/python -m spark_queries.executor
+.PHONY: clean-tpch-dbgen
+clean-tpch-dbgen:  ## Clean up TPC-H folder
+	@$(MAKE) -C tpch-dbgen clean
 
-run_duckdb: .venv
-	.venv/bin/python -m duckdb_queries.executor
+.PHONY: clean-tables
+clean-tables:  ## Clean up data tables
+	@rm -rf tables_scale_*
 
-plot_results: .venv
-	.venv/bin/python -m scripts.plot_results
 
-run_all: run_polars run_pandas run_vaex run_dask run_modin run_spark
+.PHONY: help
+help:  ## Display this help screen
+	@echo -e "\033[1mAvailable commands:\033[0m"
+	@grep -E '^[a-z.A-Z_0-9-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "  \033[36m%-22s\033[0m %s\n", $$1, $$2}' | sort
 
-pre-commit:
-	.venv/bin/python -m isort .
-	.venv/bin/python -m black .
diff --git a/polars_queries/q12.py b/polars_queries/q12.py
@@ -28,9 +28,7 @@ def q():
                 .then(1)
                 .otherwise(0)
                 .alias("high_line_count"),
-                pl.when(
-                    pl.col("o_orderpriority").is_in(["1-URGENT", "2-HIGH"]).not_()
-                )
+                pl.when(pl.col("o_orderpriority").is_in(["1-URGENT", "2-HIGH"]).not_())
                 .then(1)
                 .otherwise(0)
                 .alias("low_line_count"),

diff --git a/polars_queries/q21.py b/polars_queries/q21.py
@@ -14,14 +14,12 @@ def q():
     var_1 = "SAUDI ARABIA"
 
     res_1 = (
-        (
-            line_item_ds.group_by("l_orderkey")
-            .agg(pl.col("l_suppkey").n_unique().alias("nunique_col"))
-            .filter(pl.col("nunique_col") > 1)
-            .join(
-                line_item_ds.filter(pl.col("l_receiptdate") > pl.col("l_commitdate")),
-                on="l_orderkey",
-            )
+        line_item_ds.group_by("l_orderkey")
+        .agg(pl.col("l_suppkey").n_unique().alias("nunique_col"))
+        .filter(pl.col("nunique_col") > 1)
+        .join(
+            line_item_ds.filter(pl.col("l_receiptdate") > pl.col("l_commitdate")),
+            on="l_orderkey",
         )
     ).cache()
 

diff --git a/prepare_files.py b/prepare_files.py
@@ -7,15 +7,11 @@
 h_nation = """n_nationkey
 n_name
 n_regionkey
-n_comment""".split(
-    "\n"
-)
+n_comment""".split("\n")
 
 h_region = """r_regionkey
 r_name
-r_comment""".split(
-    "\n"
-)
+r_comment""".split("\n")
 
 h_part = """p_partkey
 p_name
@@ -25,27 +21,21 @@
 p_size
 p_container
 p_retailprice
-p_comment""".split(
-    "\n"
-)
+p_comment""".split("\n")
 
 h_supplier = """s_suppkey
 s_name
 s_address
 s_nationkey
 s_phone
 s_acctbal
-s_comment""".split(
-    "\n"
-)
+s_comment""".split("\n")
 
 h_partsupp = """ps_partkey
 ps_suppkey
 ps_availqty
 ps_supplycost
-ps_comment""".split(
-    "\n"
-)
+ps_comment""".split("\n")
 
 h_customer = """c_custkey
 c_name
@@ -54,9 +44,7 @@
 c_phone
 c_acctbal
 c_mktsegment
-c_comment""".split(
-    "\n"
-)
+c_comment""".split("\n")
 
 h_orders = """o_orderkey
 o_custkey
@@ -66,9 +54,7 @@
 o_orderpriority
 o_clerk
 o_shippriority
-o_comment""".split(
-    "\n"
-)
+o_comment""".split("\n")
 
 h_lineitem = """l_orderkey
 l_partkey
@@ -85,9 +71,7 @@
 l_receiptdate
 l_shipinstruct
 l_shipmode
-comments""".split(
-    "\n"
-)
+comments""".split("\n")
 
 for name in [
     "nation",

diff --git a/prepare_large_files.py b/prepare_large_files.py
@@ -8,16 +8,12 @@
 n_nationkey
 n_name
 n_regionkey
-n_comment""".split(
-    "\n"
-)
+n_comment""".split("\n")
 
 h_region = """
 r_regionkey
 r_name
-r_comment""".split(
-    "\n"
-)
+r_comment""".split("\n")
 
 h_part = """
 p_partkey
@@ -28,9 +24,7 @@
 p_size
 p_container
 p_retailprice
-p_comment""".split(
-    "\n"
-)
+p_comment""".split("\n")
 
 h_supplier = """
 s_suppkey
@@ -39,18 +33,14 @@
 s_nationkey
 s_phone
 s_acctbal
-s_comment""".split(
-    "\n"
-)
+s_comment""".split("\n")
 
 h_partsupp = """
 ps_partkey
 ps_suppkey
 ps_availqty
 ps_supplycost
-ps_comment""".split(
-    "\n"
-)
+ps_comment""".split("\n")
 
 h_customer = """
 c_custkey
@@ -60,9 +50,7 @@
 c_phone
 c_acctbal
 c_mktsegment
-c_comment""".split(
-    "\n"
-)
+c_comment""".split("\n")
 
 h_orders = """
 o_orderkey
@@ -73,9 +61,7 @@
 o_orderpriority
 o_clerk
 o_shippriority
-o_comment""".split(
-    "\n"
-)
+o_comment""".split("\n")
 
 h_lineitem = """
 l_orderkey
@@ -93,9 +79,7 @@
 l_receiptdate
 l_shipinstruct
 l_shipmode
-comments""".split(
-    "\n"
-)
+comments""".split("\n")
 
 for name in [
     "nation",