Skip to content

Commit

Permalink
Rework xgboost jvm packages for spark
Browse files Browse the repository at this point in the history
- Remove xgboost4j-gpu and move its functionality to xgboost4j-spark-gpu
- Remove any soft links in xgboost4j-spark-gpu
- Abstract an XGBoost Estimator which handles the common functionality for all xgboost estimators.
- Support XGBoostRanker
- Remove any unnecessary ETL in XGBoost
- Fix the missing value usage
- Support uber jar for xgboost4j-spark
- Support uber jar for xgboost4j-spark-gpu
- Rework GPU plugin
- More sanity tests to ensure the training/transform results are the same between CPU and GPU.
  • Loading branch information
wbo4958 committed Jul 22, 2024
1 parent 6d9fcb7 commit 4916a60
Show file tree
Hide file tree
Showing 125 changed files with 6,120 additions and 8,742 deletions.
11 changes: 11 additions & 0 deletions dev/change_scala_version.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,17 @@ def main(args):
)
if nsubs > 0:
replaced_scala_binver = True
# Replace the final name of shaded jar
if "<finalName>" in line:
for artifact in [
"xgboost-spark",
"xgboost-spark-gpu",
]:
line = re.sub(
f"<finalName>{artifact}_[0-9\\.]*",
f"<finalName>{artifact}_{scala_ver}",
line,
)
f.write(line)


Expand Down
2 changes: 2 additions & 0 deletions include/xgboost/c_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -402,13 +402,15 @@ XGB_EXTERN_C typedef int XGBCallbackDataIterNext( // NOLINT(*)
* \param data_handle The handle to the data.
* \param callback The callback to get the data.
* \param cache_info Additional information about cache file, can be null.
* \param missing Which value to represent missing value.
* \param out The created DMatrix
* \return 0 when success, -1 when failure happens.
*/
XGB_DLL int XGDMatrixCreateFromDataIter(
DataIterHandle data_handle,
XGBCallbackDataIterNext* callback,
const char* cache_info,
float missing,
DMatrixHandle *out);

/**
Expand Down
3 changes: 2 additions & 1 deletion jvm-packages/.gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
tracker.py
build.sh
xgboost4j-tester/pom.xml
xgboost4j-tester/iris.csv
dependency-reduced-pom.xml
surefire-reports
4 changes: 2 additions & 2 deletions jvm-packages/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@ find_package(JNI REQUIRED)

list(APPEND JVM_SOURCES
${PROJECT_SOURCE_DIR}/jvm-packages/xgboost4j/src/native/xgboost4j.cpp
${PROJECT_SOURCE_DIR}/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.cpp)
${PROJECT_SOURCE_DIR}/jvm-packages/xgboost4j/src/native/xgboost4j-gpu.cpp)

if(USE_CUDA)
list(APPEND JVM_SOURCES
${PROJECT_SOURCE_DIR}/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.cu)
${PROJECT_SOURCE_DIR}/jvm-packages/xgboost4j/src/native/xgboost4j-gpu.cu)
endif()

add_library(xgboost4j SHARED ${JVM_SOURCES} ${XGBOOST_OBJ_SOURCES})
Expand Down
67 changes: 67 additions & 0 deletions jvm-packages/aggregator-gpu/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Aggregator module for the GPU flavour of the Spark package.
It contains no sources of its own: it depends on xgboost4j, xgboost4j-spark and
xgboost4j-spark-gpu and uses maven-shade-plugin to bundle exactly those three
artifacts into a single uber jar named xgboost-spark-gpu_<scala>-<version>.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Inherits from the root xgboost-jvm POM. The shade plugin below declares no
version, so it is presumably managed by this parent (confirm in the root pom.xml). -->
<parent>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>2.2.0-SNAPSHOT</version>
</parent>

<name>xgboost-spark-gpu</name>
<artifactId>xgboost-spark-gpu_2.12</artifactId>

<!-- Modules merged into the uber jar. Each one listed here must also appear in
the shade plugin's <artifactSet> below, otherwise it is not bundled. -->
<dependencies>
<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j_2.12</artifactId>
<version>${project.version}</version>
<scope>compile</scope>
</dependency>

<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-spark_2.12</artifactId>
<version>${project.version}</version>
<scope>compile</scope>
</dependency>

<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-spark-gpu_2.12</artifactId>
<version>${project.version}</version>
<scope>compile</scope>
</dependency>
</dependencies>

<build>
<!-- The Scala binary version suffix in this name is rewritten by
dev/change_scala_version.py (it matches "<finalName>xgboost-spark-gpu_..."),
so keep the "<finalName>xgboost-spark-gpu_<digits>" prefix intact. -->
<finalName>xgboost-spark-gpu_2.12-${project.version}</finalName>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<configuration>
<!-- false: the shaded jar is the module's main artifact rather than an
additional attached (classified) artifact. -->
<shadedArtifactAttached>false</shadedArtifactAttached>
<!-- true: the plugin writes a dependency-reduced-pom.xml for the shaded jar. -->
<createDependencyReducedPom>true</createDependencyReducedPom>
<!-- Only the xgboost modules are bundled; no other artifact is named here. -->
<artifactSet>
<includes>
<include>ml.dmlc:xgboost4j_2.12</include>
<include>ml.dmlc:xgboost4j-spark_2.12</include>
<include>ml.dmlc:xgboost4j-spark-gpu_2.12</include>
</includes>
</artifactSet>
</configuration>
<!-- Run the shade goal as part of the package phase. -->
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>

</project>
59 changes: 59 additions & 0 deletions jvm-packages/aggregator/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Aggregator module for the CPU flavour of the Spark package.
It contains no sources of its own: it depends on xgboost4j and xgboost4j-spark
and uses maven-shade-plugin to bundle exactly those two artifacts into a single
uber jar named xgboost-spark_<scala>-<version>.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Inherits from the root xgboost-jvm POM. The shade plugin below declares no
version, so it is presumably managed by this parent (confirm in the root pom.xml). -->
<parent>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>2.2.0-SNAPSHOT</version>
</parent>

<name>xgboost-spark</name>
<artifactId>xgboost-spark_2.12</artifactId>

<!-- Modules merged into the uber jar. Each one listed here must also appear in
the shade plugin's <artifactSet> below, otherwise it is not bundled. -->
<dependencies>
<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j_2.12</artifactId>
<version>${project.version}</version>
<scope>compile</scope>
</dependency>

<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-spark_2.12</artifactId>
<version>${project.version}</version>
<scope>compile</scope>
</dependency>
</dependencies>

<build>
<!-- The Scala binary version suffix in this name is rewritten by
dev/change_scala_version.py (it matches "<finalName>xgboost-spark_..."),
so keep the "<finalName>xgboost-spark_<digits>" prefix intact. -->
<finalName>xgboost-spark_2.12-${project.version}</finalName>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<configuration>
<!-- false: the shaded jar is the module's main artifact rather than an
additional attached (classified) artifact. -->
<shadedArtifactAttached>false</shadedArtifactAttached>
<!-- true: the plugin writes a dependency-reduced-pom.xml for the shaded jar. -->
<createDependencyReducedPom>true</createDependencyReducedPom>
<!-- Only the xgboost modules are bundled; no other artifact is named here. -->
<artifactSet>
<includes>
<include>ml.dmlc:xgboost4j_2.12</include>
<include>ml.dmlc:xgboost4j-spark_2.12</include>
</includes>
</artifactSet>
</configuration>
<!-- Run the shade goal as part of the package phase. -->
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>

</project>
2 changes: 1 addition & 1 deletion jvm-packages/checkstyle.xml
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@
<property name="braceAdjustment" value="0"/>
<property name="caseIndent" value="2"/>
<property name="throwsIndent" value="4"/>
<property name="lineWrappingIndentation" value="4"/>
<property name="lineWrappingIndentation" value="2"/>
<property name="arrayInitIndent" value="2"/>
</module>
<module name="ImportOrder">
Expand Down
34 changes: 17 additions & 17 deletions jvm-packages/create_jni.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,14 +131,6 @@ def native_build(args):
run("cmake .. " + " ".join(args))
run("cmake --build . --config Release" + maybe_parallel_build)

with cd("demo/CLI/regression"):
run(f'"{sys.executable}" mapfeat.py')
run(f'"{sys.executable}" mknfold.py machine.txt 1')

xgboost4j = "xgboost4j-gpu" if cli_args.use_cuda == "ON" else "xgboost4j"
xgboost4j_spark = (
"xgboost4j-spark-gpu" if cli_args.use_cuda == "ON" else "xgboost4j-spark"
)

print("copying native library")
library_name, os_folder = {
Expand All @@ -155,26 +147,34 @@ def native_build(args):
"arm64": "aarch64", # on macOS & Windows ARM 64-bit
"aarch64": "aarch64",
}[platform.machine().lower()]
output_folder = "{}/src/main/resources/lib/{}/{}".format(
xgboost4j, os_folder, arch_folder
output_folder = "xgboost4j/src/main/resources/lib/{}/{}".format(
os_folder, arch_folder
)
maybe_makedirs(output_folder)
cp("../lib/" + library_name, output_folder)

print("copying train/test files")
maybe_makedirs("{}/src/test/resources".format(xgboost4j_spark))

# for xgboost4j
maybe_makedirs("xgboost4j/src/test/resources")
for file in glob.glob("../demo/data/agaricus.*"):
cp(file, "xgboost4j/src/test/resources")

# for xgboost4j-spark
maybe_makedirs("xgboost4j-spark/src/test/resources")
with cd("../demo/CLI/regression"):
run(f'"{sys.executable}" mapfeat.py')
run(f'"{sys.executable}" mknfold.py machine.txt 1')

for file in glob.glob("../demo/CLI/regression/machine.txt.t*"):
cp(file, "{}/src/test/resources".format(xgboost4j_spark))
cp(file, "xgboost4j-spark/src/test/resources")
for file in glob.glob("../demo/data/agaricus.*"):
cp(file, "{}/src/test/resources".format(xgboost4j_spark))
cp(file, "xgboost4j-spark/src/test/resources")

maybe_makedirs("{}/src/test/resources".format(xgboost4j))
for file in glob.glob("../demo/data/agaricus.*"):
cp(file, "{}/src/test/resources".format(xgboost4j))
# for xgboost4j-spark-gpu
if cli_args.use_cuda == "ON":
maybe_makedirs("xgboost4j-spark-gpu/src/test/resources")
for file in glob.glob("../demo/data/veterans_lung_cancer.csv"):
cp(file, "xgboost4j-spark-gpu/src/test/resources")


if __name__ == "__main__":
Expand Down
Loading

0 comments on commit 4916a60

Please sign in to comment.