From 613705f10c3520f241bbbee6092c895bff3c3860 Mon Sep 17 00:00:00 2001 From: xylaaaaa <2392805527@qq.com> Date: Sun, 29 Mar 2026 20:40:24 +0800 Subject: [PATCH 1/2] [improvement](build) Limit hive bootstrap data by version ### What problem does this PR solve? Issue Number: None Related PR: None Problem Summary: Avoid loading every Hive bootstrap dataset for both Hive2 and Hive3 in external regress. Add bootstrap group manifests so each Hive version only prepares and loads its version-specific data while keeping shared assets in the common group. ### Release note None ### Check List (For Author) - Test: Manual test - bash -n docker/thirdparties/docker-compose/hive/scripts/bootstrap/bootstrap-groups.sh - bash -n docker/thirdparties/docker-compose/hive/scripts/prepare-hive-data.sh - bash -n docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh - bash -n docker/thirdparties/run-thirdparties-docker.sh - bash -n regression-test/pipeline/common/get-hive-bootstrap-groups.sh - helper smoke test for hive2/hive3 bootstrap group selection - Behavior changed: Yes (Hive bootstrap now skips version-specific data for the other Hive version) - Does this need documentation: No --- .../docker-compose/hive/hadoop-hive.env.tpl | 1 + .../scripts/bootstrap/bootstrap-groups.sh | 167 ++++++++++++++++++ .../hive2_only.preinstalled_hql.list | 4 + .../scripts/bootstrap/hive2_only.run_sh.list | 1 + .../bootstrap/hive3_only.download_dir.list | 3 + .../scripts/bootstrap/hive3_only.run_sh.list | 3 + .../hive/scripts/hive-metastore.sh | 118 ++++++++----- .../hive/scripts/prepare-hive-data.sh | 140 ++++++--------- .../thirdparties/run-thirdparties-docker.sh | 21 +++ .../common/get-hive-bootstrap-groups.sh | 42 +++++ 10 files changed, 369 insertions(+), 131 deletions(-) create mode 100644 docker/thirdparties/docker-compose/hive/scripts/bootstrap/bootstrap-groups.sh create mode 100644 docker/thirdparties/docker-compose/hive/scripts/bootstrap/hive2_only.preinstalled_hql.list create mode 
100644 docker/thirdparties/docker-compose/hive/scripts/bootstrap/hive2_only.run_sh.list create mode 100644 docker/thirdparties/docker-compose/hive/scripts/bootstrap/hive3_only.download_dir.list create mode 100644 docker/thirdparties/docker-compose/hive/scripts/bootstrap/hive3_only.run_sh.list create mode 100644 regression-test/pipeline/common/get-hive-bootstrap-groups.sh diff --git a/docker/thirdparties/docker-compose/hive/hadoop-hive.env.tpl b/docker/thirdparties/docker-compose/hive/hadoop-hive.env.tpl index d48d497bafa039..382baaa46d5ea6 100644 --- a/docker/thirdparties/docker-compose/hive/hadoop-hive.env.tpl +++ b/docker/thirdparties/docker-compose/hive/hadoop-hive.env.tpl @@ -64,3 +64,4 @@ HADOOP_HEAPSIZE=4096 NEED_LOAD_DATA=${NEED_LOAD_DATA} LOAD_PARALLEL=${LOAD_PARALLEL} +HIVE_BOOTSTRAP_GROUPS=${HIVE_BOOTSTRAP_GROUPS} diff --git a/docker/thirdparties/docker-compose/hive/scripts/bootstrap/bootstrap-groups.sh b/docker/thirdparties/docker-compose/hive/scripts/bootstrap/bootstrap-groups.sh new file mode 100644 index 00000000000000..07ef8350776a1c --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/bootstrap/bootstrap-groups.sh @@ -0,0 +1,167 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +BOOTSTRAP_HELPER_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" + +bootstrap_normalize_groups() { + local raw_groups="${1:-}" + local cleaned_groups="${raw_groups// /}" + local parsed_groups=() + local deduped_groups=() + local group="" + local seen="," + + if [[ -z "${cleaned_groups}" ]]; then + echo "all" + return 0 + fi + + IFS=',' read -r -a parsed_groups <<< "${cleaned_groups}" + for group in "${parsed_groups[@]}"; do + [[ -n "${group}" ]] || continue + case "${group}" in + all|common|hive2_only|hive3_only) + ;; + *) + echo "Unknown hive bootstrap group: ${group}" >&2 + return 1 + ;; + esac + + if [[ "${group}" == "all" ]]; then + echo "all" + return 0 + fi + + if [[ "${seen}" == *",${group},"* ]]; then + continue + fi + + seen="${seen}${group}," + deduped_groups+=("${group}") + done + + if (( ${#deduped_groups[@]} == 0 )); then + echo "all" + return 0 + fi + + local old_ifs="${IFS}" + IFS=',' + echo "${deduped_groups[*]}" + IFS="${old_ifs}" +} + +bootstrap_group_enabled() { + local normalized_groups="${1:-all}" + local group="${2}" + + if [[ "${normalized_groups}" == "all" ]]; then + return 0 + fi + + [[ ",${normalized_groups}," == *",${group},"* ]] +} + +bootstrap_merge_groups() { + local groups_input="" + local normalized_groups="" + local include_common=0 + local include_hive2_only=0 + local include_hive3_only=0 + local merged_groups=() + + for groups_input in "$@"; do + normalized_groups="$(bootstrap_normalize_groups "${groups_input}")" || return 1 + if [[ "${normalized_groups}" == "all" ]]; then + echo "all" + return 0 + fi + + bootstrap_group_enabled "${normalized_groups}" "common" && include_common=1 + bootstrap_group_enabled "${normalized_groups}" "hive2_only" && include_hive2_only=1 + bootstrap_group_enabled "${normalized_groups}" "hive3_only" && include_hive3_only=1 + done + + (( include_common == 1 )) && merged_groups+=("common") + (( include_hive2_only == 1 )) && merged_groups+=("hive2_only") + (( include_hive3_only == 1 )) && 
merged_groups+=("hive3_only") + + if (( ${#merged_groups[@]} == 0 )); then + echo "all" + return 0 + fi + + local old_ifs="${IFS}" + IFS=',' + echo "${merged_groups[*]}" + IFS="${old_ifs}" +} + +bootstrap_list_contains() { + local group="${1}" + local kind="${2}" + local relative_path="${3}" + local list_path="${BOOTSTRAP_HELPER_DIR}/${group}.${kind}.list" + + [[ -f "${list_path}" ]] || return 1 + grep -Fxq "${relative_path}" "${list_path}" +} + +bootstrap_item_group() { + local kind="${1}" + local relative_path="${2}" + local matched_group="" + local group="" + + for group in hive2_only hive3_only; do + if bootstrap_list_contains "${group}" "${kind}" "${relative_path}"; then + if [[ -n "${matched_group}" ]]; then + echo "Bootstrap item ${relative_path} is mapped to multiple groups" >&2 + return 1 + fi + matched_group="${group}" + fi + done + + if [[ -z "${matched_group}" ]]; then + echo "common" + return 0 + fi + + echo "${matched_group}" +} + +bootstrap_item_selected() { + local normalized_groups="${1:-all}" + local kind="${2}" + local relative_path="${3}" + local item_group="" + + item_group="$(bootstrap_item_group "${kind}" "${relative_path}")" || return 1 + bootstrap_group_enabled "${normalized_groups}" "${item_group}" +} + +bootstrap_archive_selected() { + local normalized_groups="${1:-all}" + local relative_archive_path="${2}" + local relative_run_script_path + + relative_run_script_path="$(dirname "${relative_archive_path}")/run.sh" + bootstrap_item_selected "${normalized_groups}" "run_sh" "${relative_run_script_path}" +} diff --git a/docker/thirdparties/docker-compose/hive/scripts/bootstrap/hive2_only.preinstalled_hql.list b/docker/thirdparties/docker-compose/hive/scripts/bootstrap/hive2_only.preinstalled_hql.list new file mode 100644 index 00000000000000..e8dbd74962e406 --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/bootstrap/hive2_only.preinstalled_hql.list @@ -0,0 +1,4 @@ +create_preinstalled_scripts/run67.hql 
+create_preinstalled_scripts/run80.hql +create_preinstalled_scripts/run81.hql +create_preinstalled_scripts/run84.hql diff --git a/docker/thirdparties/docker-compose/hive/scripts/bootstrap/hive2_only.run_sh.list b/docker/thirdparties/docker-compose/hive/scripts/bootstrap/hive2_only.run_sh.list new file mode 100644 index 00000000000000..3d3efc379abf12 --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/bootstrap/hive2_only.run_sh.list @@ -0,0 +1 @@ +data/multi_catalog/hive_config_test/run.sh diff --git a/docker/thirdparties/docker-compose/hive/scripts/bootstrap/hive3_only.download_dir.list b/docker/thirdparties/docker-compose/hive/scripts/bootstrap/hive3_only.download_dir.list new file mode 100644 index 00000000000000..0144d74187df5e --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/bootstrap/hive3_only.download_dir.list @@ -0,0 +1,3 @@ +data/multi_catalog/logs1_parquet/data +data/multi_catalog/test_complex_types/data +data/multi_catalog/test_wide_table/data diff --git a/docker/thirdparties/docker-compose/hive/scripts/bootstrap/hive3_only.run_sh.list b/docker/thirdparties/docker-compose/hive/scripts/bootstrap/hive3_only.run_sh.list new file mode 100644 index 00000000000000..f1d0eea8384105 --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/bootstrap/hive3_only.run_sh.list @@ -0,0 +1,3 @@ +data/multi_catalog/logs1_parquet/run.sh +data/multi_catalog/test_complex_types/run.sh +data/multi_catalog/test_wide_table/run.sh diff --git a/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh b/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh index 69d5af071b78bd..2a7c580848117b 100755 --- a/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh +++ b/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh @@ -18,6 +18,9 @@ set -e -x +. 
/mnt/scripts/bootstrap/bootstrap-groups.sh +BOOTSTRAP_GROUPS="$(bootstrap_normalize_groups "${HIVE_BOOTSTRAP_GROUPS:-}")" +echo "Load hive data with bootstrap groups: ${BOOTSTRAP_GROUPS}" AUX_LIB="/mnt/scripts/auxlib" for file in "${AUX_LIB}"/*.tar.gz; do @@ -73,68 +76,99 @@ fi hadoop fs -mkdir -p /user/doris/suites/ DATA_DIR="/mnt/scripts/data/" -find "${DATA_DIR}" -type f -name "run.sh" -print0 | xargs -0 -n 1 -P "${LOAD_PARALLEL}" -I {} bash -ec ' - START_TIME=$(date +%s) - bash -e "{}" || (echo "Failed to executing script: {}" && exit 1) - END_TIME=$(date +%s) - EXECUTION_TIME=$((END_TIME - START_TIME)) - echo "Script: {} executed in $EXECUTION_TIME seconds" -' +run_scripts=() +while IFS= read -r -d '' run_script; do + relative_run_script="${run_script#/mnt/scripts/}" + if bootstrap_item_selected "${BOOTSTRAP_GROUPS}" "run_sh" "${relative_run_script}"; then + run_scripts+=("${run_script}") + fi +done < <(find "${DATA_DIR}" -type f -name "run.sh" -print0) + +if (( ${#run_scripts[@]} > 0 )); then + printf '%s\0' "${run_scripts[@]}" | xargs -0 -P "${LOAD_PARALLEL}" -I {} bash -ec ' + START_TIME=$(date +%s) + bash -e "{}" || (echo "Failed to executing script: {}" && exit 1) + END_TIME=$(date +%s) + EXECUTION_TIME=$((END_TIME - START_TIME)) + echo "Script: {} executed in $EXECUTION_TIME seconds" + ' +fi # put data file hadoop_put_pids=() +hadoop_put_paths=() hadoop fs -mkdir -p /user/doris/ +copy_to_hdfs_if_selected() { + local relative_path="$1" + local local_path="/mnt/scripts/${relative_path}" + + if ! bootstrap_item_selected "${BOOTSTRAP_GROUPS}" "hdfs_dir" "${relative_path}"; then + return + fi + + if [[ ! -e "${local_path}" ]]; then + echo "${local_path} does not exist" + exit 1 + fi + + if [[ -d "${local_path}" && -z "$(ls "${local_path}")" ]]; then + echo "${local_path} does not exist" + exit 1 + fi + + hadoop fs -copyFromLocal -f "${local_path}" /user/doris/ & + hadoop_put_pids+=($!) 
+ hadoop_put_paths+=("${relative_path}") +} + ## put tpch1 -if [[ -z "$(ls /mnt/scripts/tpch1.db)" ]]; then - echo "tpch1.db does not exist" - exit 1 -fi -hadoop fs -copyFromLocal -f /mnt/scripts/tpch1.db /user/doris/ & -hadoop_put_pids+=($!) +copy_to_hdfs_if_selected "tpch1.db" ## put paimon1 -hadoop fs -copyFromLocal -f /mnt/scripts/paimon1 /user/doris/ & -hadoop_put_pids+=($!) +copy_to_hdfs_if_selected "paimon1" ## put tvf_data -if [[ -z "$(ls /mnt/scripts/tvf_data)" ]]; then - echo "tvf_data does not exist" - exit 1 -fi -hadoop fs -copyFromLocal -f /mnt/scripts/tvf_data /user/doris/ & -hadoop_put_pids+=($!) +copy_to_hdfs_if_selected "tvf_data" ## put other preinstalled data -hadoop fs -copyFromLocal -f /mnt/scripts/preinstalled_data /user/doris/ & -hadoop_put_pids+=($!) +copy_to_hdfs_if_selected "preinstalled_data" # wait put finish -wait "${hadoop_put_pids[@]}" -if [[ -z "$(hadoop fs -ls /user/doris/paimon1)" ]]; then - echo "paimon1 put failed" - exit 1 -fi -if [[ -z "$(hadoop fs -ls /user/doris/tpch1.db)" ]]; then - echo "tpch1.db put failed" - exit 1 -fi -if [[ -z "$(hadoop fs -ls /user/doris/tvf_data)" ]]; then - echo "tvf_data put failed" - exit 1 +if (( ${#hadoop_put_pids[@]} > 0 )); then + wait "${hadoop_put_pids[@]}" fi +for relative_path in "${hadoop_put_paths[@]}"; do + if ! 
hadoop fs -test -e "/user/doris/${relative_path}"; then + echo "${relative_path} put failed" + exit 1 + fi +done + # create tables -ls /mnt/scripts/create_preinstalled_scripts/*.hql | xargs -n 1 -P "${LOAD_PARALLEL}" -I {} bash -ec ' - START_TIME=$(date +%s) - hive -f {} || (echo "Failed to executing hql: {}" && exit 1) - END_TIME=$(date +%s) - EXECUTION_TIME=$((END_TIME - START_TIME)) - echo "Script: {} executed in $EXECUTION_TIME seconds" -' +shopt -s nullglob +preinstalled_hqls=() +for hql_path in /mnt/scripts/create_preinstalled_scripts/*.hql; do + relative_hql_path="${hql_path#/mnt/scripts/}" + if bootstrap_item_selected "${BOOTSTRAP_GROUPS}" "preinstalled_hql" "${relative_hql_path}"; then + preinstalled_hqls+=("${hql_path}") + fi +done +shopt -u nullglob + +if (( ${#preinstalled_hqls[@]} > 0 )); then + printf '%s\0' "${preinstalled_hqls[@]}" | xargs -0 -P "${LOAD_PARALLEL}" -I {} bash -ec ' + START_TIME=$(date +%s) + hive -f {} || (echo "Failed to executing hql: {}" && exit 1) + END_TIME=$(date +%s) + EXECUTION_TIME=$((END_TIME - START_TIME)) + echo "Script: {} executed in $EXECUTION_TIME seconds" + ' +fi # create view START_TIME=$(date +%s) diff --git a/docker/thirdparties/docker-compose/hive/scripts/prepare-hive-data.sh b/docker/thirdparties/docker-compose/hive/scripts/prepare-hive-data.sh index 7d1c7d8696f2e4..e68a5788fd139d 100644 --- a/docker/thirdparties/docker-compose/hive/scripts/prepare-hive-data.sh +++ b/docker/thirdparties/docker-compose/hive/scripts/prepare-hive-data.sh @@ -18,110 +18,73 @@ set -eo pipefail # under the License. CUR_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" -# Extract all tar.gz files under the repo -find ${CUR_DIR}/data -type f -name "*.tar.gz" -print0 | \ -xargs -0 -n1 -P"${LOAD_PARALLEL}" bash -c ' - f="$0" - echo "Extracting hive data $f" - dir=$(dirname "$f") - tar -xzf "$f" -C "$dir" -' +. "${CUR_DIR}/bootstrap/bootstrap-groups.sh" -# download tpch1_data -if [[ ! 
-d "${CUR_DIR}/tpch1.db" ]]; then - echo "${CUR_DIR}/tpch1.db does not exist" - cd ${CUR_DIR}/ - curl -O https://${s3BucketName}.${s3Endpoint}/regression/datalake/pipeline_data/tpch1.db.tar.gz - tar -zxf tpch1.db.tar.gz - rm -rf tpch1.db.tar.gz - cd - -else - echo "${CUR_DIR}/tpch1.db exist, continue !" +BOOTSTRAP_GROUPS="$(bootstrap_normalize_groups "${HIVE_BOOTSTRAP_GROUPS:-}")" +echo "Prepare hive data with bootstrap groups: ${BOOTSTRAP_GROUPS}" + +extract_archives=() +while IFS= read -r -d '' archive_path; do + relative_archive_path="${archive_path#${CUR_DIR}/}" + if bootstrap_archive_selected "${BOOTSTRAP_GROUPS}" "${relative_archive_path}"; then + extract_archives+=("${archive_path}") + fi +done < <(find "${CUR_DIR}/data" -type f -name "*.tar.gz" -print0) + +if (( ${#extract_archives[@]} > 0 )); then + printf '%s\0' "${extract_archives[@]}" | xargs -0 -n1 -P"${LOAD_PARALLEL}" bash -c ' + f="$0" + echo "Extracting hive data $f" + dir=$(dirname "$f") + tar -xzf "$f" -C "$dir" + ' fi +download_archive_if_missing() { + local relative_dir="$1" + local workdir="$2" + local remote_path="$3" + local archive_name="$4" + + if ! bootstrap_item_selected "${BOOTSTRAP_GROUPS}" "download_dir" "${relative_dir}"; then + return + fi + + if [[ ! -d "${CUR_DIR}/${relative_dir}" ]]; then + echo "${CUR_DIR}/${relative_dir} does not exist" + pushd "${CUR_DIR}/${workdir}" >/dev/null + curl -O "https://${s3BucketName}.${s3Endpoint}/regression/datalake/pipeline_data/${remote_path}" + tar -xzf "${archive_name}" + rm -rf "${archive_name}" + popd >/dev/null + else + echo "${CUR_DIR}/${relative_dir} exist, continue !" + fi +} + +# download tpch1_data +download_archive_if_missing "tpch1.db" "." "tpch1.db.tar.gz" "tpch1.db.tar.gz" + # download tvf_data -if [[ ! 
-d "${CUR_DIR}/tvf_data" ]]; then - echo "${CUR_DIR}/tvf_data does not exist" - cd ${CUR_DIR}/ - curl -O https://${s3BucketName}.${s3Endpoint}/regression/datalake/pipeline_data/tvf_data.tar.gz - tar -zxf tvf_data.tar.gz - rm -rf tvf_data.tar.gz - cd - -else - echo "${CUR_DIR}/tvf_data exist, continue !" -fi +download_archive_if_missing "tvf_data" "." "tvf_data.tar.gz" "tvf_data.tar.gz" # download test_complex_types data -if [[ ! -d "${CUR_DIR}/data/multi_catalog/test_complex_types/data" ]]; then - echo "${CUR_DIR}/data/multi_catalog/test_complex_types/data does not exist" - cd "${CUR_DIR}/data/multi_catalog/test_complex_types" - curl -O https://${s3BucketName}.${s3Endpoint}/regression/datalake/pipeline_data/multi_catalog/test_complex_types/data.tar.gz - tar xzf data.tar.gz - rm -rf data.tar.gz - cd - -else - echo "${CUR_DIR}/data/multi_catalog/test_complex_types/data exist, continue !" -fi +download_archive_if_missing "data/multi_catalog/test_complex_types/data" "data/multi_catalog/test_complex_types" "multi_catalog/test_complex_types/data.tar.gz" "data.tar.gz" # download test_compress_partitioned data -if [[ ! -d "${CUR_DIR}/data/multi_catalog/test_compress_partitioned/data" ]]; then - echo "${CUR_DIR}/data/multi_catalog/test_compress_partitioned/data does not exist" - cd "${CUR_DIR}/data/multi_catalog/test_compress_partitioned" - curl -O https://${s3BucketName}.${s3Endpoint}/regression/datalake/pipeline_data/multi_catalog/test_compress_partitioned/data.tar.gz - tar xzf data.tar.gz - rm -rf data.tar.gz - cd - -else - echo "${CUR_DIR}/data/multi_catalog/test_compress_partitioned/data exist, continue !" -fi +download_archive_if_missing "data/multi_catalog/test_compress_partitioned/data" "data/multi_catalog/test_compress_partitioned" "multi_catalog/test_compress_partitioned/data.tar.gz" "data.tar.gz" # download test_wide_table data -if [[ ! 
-d "${CUR_DIR}/data/multi_catalog/test_wide_table/data" ]]; then - echo "${CUR_DIR}/data/multi_catalog/test_wide_table/data does not exist" - cd "${CUR_DIR}/data/multi_catalog/test_wide_table" - curl -O https://${s3BucketName}.${s3Endpoint}/regression/datalake/pipeline_data/multi_catalog/test_wide_table/data.tar.gz - tar xzf data.tar.gz - rm -rf data.tar.gz - cd - -else - echo "${CUR_DIR}/data/multi_catalog/test_wide_table/data exist, continue !" -fi +download_archive_if_missing "data/multi_catalog/test_wide_table/data" "data/multi_catalog/test_wide_table" "multi_catalog/test_wide_table/data.tar.gz" "data.tar.gz" # download test_hdfs_tvf_compression data -if [[ ! -d "${CUR_DIR}/data/tvf/test_hdfs_tvf_compression/test_data" ]]; then - echo "${CUR_DIR}/data/tvf/test_hdfs_tvf_compression/test_data does not exist" - cd "${CUR_DIR}/data/tvf/test_hdfs_tvf_compression" - curl -O https://${s3BucketName}.${s3Endpoint}/regression/datalake/pipeline_data/test_hdfs_tvf_compression/test_data.tar.gz - tar xzf test_data.tar.gz - rm -rf test_data.tar.gz - cd - -else - echo "${CUR_DIR}/data/tvf/test_hdfs_tvf_compression/test_data exist, continue !" -fi +download_archive_if_missing "data/tvf/test_hdfs_tvf_compression/test_data" "data/tvf/test_hdfs_tvf_compression" "test_hdfs_tvf_compression/test_data.tar.gz" "test_data.tar.gz" # download test_tvf data -if [[ ! -d "${CUR_DIR}/data/tvf/test_tvf/tvf" ]]; then - echo "${CUR_DIR}/data/tvf/test_tvf/tvf does not exist" - cd "${CUR_DIR}/data/tvf/test_tvf" - curl -O https://${s3BucketName}.${s3Endpoint}/regression/datalake/pipeline_data/test_tvf/data.tar.gz - tar xzf data.tar.gz - rm -rf data.tar.gz - cd - -else - echo "${CUR_DIR}/data/tvf/test_tvf/tvf exist, continue !" -fi +download_archive_if_missing "data/tvf/test_tvf/tvf" "data/tvf/test_tvf" "test_tvf/data.tar.gz" "data.tar.gz" # download logs1_parquet data -if [[ ! 
-d "${CUR_DIR}/data/multi_catalog/logs1_parquet/data" ]]; then - echo "${CUR_DIR}/data/multi_catalog/logs1_parquet/data does not exist" - cd "${CUR_DIR}/data/multi_catalog/logs1_parquet" - curl -O https://${s3BucketName}.${s3Endpoint}/regression/datalake/pipeline_data/multi_catalog/logs1_parquet/data.tar.gz - tar xzf data.tar.gz - rm -rf data.tar.gz - cd - -else - echo "${CUR_DIR}/data/multi_catalog/logs1_parquet/data exist, continue !" -fi +download_archive_if_missing "data/multi_catalog/logs1_parquet/data" "data/multi_catalog/logs1_parquet" "multi_catalog/logs1_parquet/data.tar.gz" "data.tar.gz" # download auxiliary jars jars=( @@ -145,4 +108,3 @@ cd ${CUR_DIR}/auxlib for jar in "${jars[@]}"; do curl -O "https://${s3BucketName}.${s3Endpoint}/regression/docker/hive3/${jar}" done - diff --git a/docker/thirdparties/run-thirdparties-docker.sh b/docker/thirdparties/run-thirdparties-docker.sh index 12e2e9b7ba46d5..8cb93706e75250 100755 --- a/docker/thirdparties/run-thirdparties-docker.sh +++ b/docker/thirdparties/run-thirdparties-docker.sh @@ -26,6 +26,7 @@ ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" . "${ROOT}/custom_settings.env" . "${ROOT}/juicefs-helpers.sh" +. 
"${ROOT}/docker-compose/hive/scripts/bootstrap/bootstrap-groups.sh" usage() { echo " @@ -220,6 +221,13 @@ for element in "${COMPONENTS_ARR[@]}"; do fi done +if [[ "${RUN_HIVE2}" -eq 1 ]] && [[ -z "${HIVE2_BOOTSTRAP_GROUPS+x}" ]]; then + export HIVE2_BOOTSTRAP_GROUPS="common,hive2_only" +fi +if [[ "${RUN_HIVE3}" -eq 1 ]] && [[ -z "${HIVE3_BOOTSTRAP_GROUPS+x}" ]]; then + export HIVE3_BOOTSTRAP_GROUPS="common,hive3_only" +fi + reserve_ports() { if [[ "${NEED_RESERVE_PORTS}" -eq 0 ]]; then return @@ -546,6 +554,8 @@ start_hive2() { # If the doris cluster you need to test is single-node, you can use the default values; If the doris cluster you need to test is composed of multiple nodes, then you need to set the IP_HOST according to the actual situation of your machine #default value export CONTAINER_UID=${CONTAINER_UID} + export HIVE_BOOTSTRAP_GROUPS="${HIVE2_BOOTSTRAP_GROUPS:-}" + echo "Hive2 bootstrap groups: ${HIVE_BOOTSTRAP_GROUPS:-all}" . "${ROOT}"/docker-compose/hive/hive-2x_settings.env envsubst <"${ROOT}"/docker-compose/hive/hive-2x.yaml.tpl >"${ROOT}"/docker-compose/hive/hive-2x.yaml envsubst <"${ROOT}"/docker-compose/hive/hadoop-hive.env.tpl >"${ROOT}"/docker-compose/hive/hadoop-hive-2x.env @@ -560,6 +570,8 @@ start_hive3() { # hive3 # If the doris cluster you need to test is single-node, you can use the default values; If the doris cluster you need to test is composed of multiple nodes, then you need to set the IP_HOST according to the actual situation of your machine export CONTAINER_UID=${CONTAINER_UID} + export HIVE_BOOTSTRAP_GROUPS="${HIVE3_BOOTSTRAP_GROUPS:-}" + echo "Hive3 bootstrap groups: ${HIVE_BOOTSTRAP_GROUPS:-all}" . 
"${ROOT}"/docker-compose/hive/hive-3x_settings.env envsubst <"${ROOT}"/docker-compose/hive/hive-3x.yaml.tpl >"${ROOT}"/docker-compose/hive/hive-3x.yaml envsubst <"${ROOT}"/docker-compose/hive/hadoop-hive.env.tpl >"${ROOT}"/docker-compose/hive/hadoop-hive-3x.env @@ -760,7 +772,16 @@ if [[ "$NEED_LOAD_DATA" -eq 1 ]]; then fi if [[ $need_prepare_hive_data -eq 1 ]]; then + prepare_hive_bootstrap_groups=() + if [[ "${RUN_HIVE2}" -eq 1 ]]; then + prepare_hive_bootstrap_groups+=("${HIVE2_BOOTSTRAP_GROUPS:-}") + fi + if [[ "${RUN_HIVE3}" -eq 1 ]]; then + prepare_hive_bootstrap_groups+=("${HIVE3_BOOTSTRAP_GROUPS:-}") + fi + export HIVE_BOOTSTRAP_GROUPS="$(bootstrap_merge_groups "${prepare_hive_bootstrap_groups[@]}")" echo "prepare hive2/hive3 data" + echo "Prepare hive bootstrap groups: ${HIVE_BOOTSTRAP_GROUPS}" bash "${ROOT}/docker-compose/hive/scripts/prepare-hive-data.sh" fi diff --git a/regression-test/pipeline/common/get-hive-bootstrap-groups.sh b/regression-test/pipeline/common/get-hive-bootstrap-groups.sh new file mode 100644 index 00000000000000..f84a41e5011405 --- /dev/null +++ b/regression-test/pipeline/common/get-hive-bootstrap-groups.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +set -eo pipefail + +usage() { + echo "Usage: $0 <hive2|hive3|both|all>" >&2 + exit 1 +} + +case "${1:-}" in +hive2) + echo "common,hive2_only" + ;; +hive3) + echo "common,hive3_only" + ;; +both) + echo "common,hive2_only,hive3_only" + ;; +all) + echo "all" + ;; +*) + usage + ;; +esac From 3794ef768753c82bea7ddf82d7be50c975508c60 Mon Sep 17 00:00:00 2001 From: xylaaaaa <2392805527@qq.com> Date: Tue, 23 Jun 2026 20:47:40 +0800 Subject: [PATCH 2/2] [fix](hive) handle ORC legacy calendar rebasing --- be/src/format/orc/vorc_reader.cpp | 1 + contrib/apache-orc | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/be/src/format/orc/vorc_reader.cpp b/be/src/format/orc/vorc_reader.cpp index 05a78ff295c1eb..0cf07c8ef47dee 100644 --- a/be/src/format/orc/vorc_reader.cpp +++ b/be/src/format/orc/vorc_reader.cpp @@ -1306,6 +1306,7 @@ Status OrcReader::set_fill_columns( try { _row_reader_options.range(_range_start_offset, _range_size); _row_reader_options.setTimezoneName(_ctz == "CST" ? "Asia/Shanghai" : _ctz); + _row_reader_options.setUseProlepticGregorian(true); if (!_column_ids.empty()) { std::list<uint64_t> column_ids_list(_column_ids.begin(), _column_ids.end()); _row_reader_options.includeTypes(column_ids_list); diff --git a/contrib/apache-orc b/contrib/apache-orc index be0f1b73a7aeb7..401461893808c7 160000 --- a/contrib/apache-orc +++ b/contrib/apache-orc @@ -1 +1 @@ -Subproject commit be0f1b73a7aeb78824a03e0dcb692c50a176d513 +Subproject commit 401461893808c736268022a5e2270e90d13d98d1