Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions be/src/format/orc/vorc_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1306,6 +1306,7 @@ Status OrcReader::set_fill_columns(
try {
_row_reader_options.range(_range_start_offset, _range_size);
_row_reader_options.setTimezoneName(_ctz == "CST" ? "Asia/Shanghai" : _ctz);
_row_reader_options.setUseProlepticGregorian(true);
if (!_column_ids.empty()) {
std::list<uint64_t> column_ids_list(_column_ids.begin(), _column_ids.end());
_row_reader_options.includeTypes(column_ids_list);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,4 @@ HADOOP_HEAPSIZE=4096

NEED_LOAD_DATA=${NEED_LOAD_DATA}
LOAD_PARALLEL=${LOAD_PARALLEL}
HIVE_BOOTSTRAP_GROUPS=${HIVE_BOOTSTRAP_GROUPS}
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

BOOTSTRAP_HELPER_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"

#######################################
# Normalize a comma-separated list of hive bootstrap groups.
#
# Every whitespace character (spaces, tabs, CR/LF) is stripped before the
# list is split on commas, so values such as "common, hive3_only" or a value
# carrying a trailing carriage return are accepted. Empty items are ignored
# and duplicates are dropped, keeping first-seen order.
#
# Arguments:
#   $1 - raw group list (optional, may be empty)
# Outputs:
#   stdout: "all" when the list is empty, holds only empty items, or an
#           "all" item is reached; otherwise the distinct groups joined
#           with commas.
#   stderr: a message naming the first unknown group.
# Returns:
#   0 on success, 1 when an unknown group is encountered. Items are checked
#   left to right, so items that follow an "all" item are never inspected.
#######################################
bootstrap_normalize_groups() {
    local raw_groups="${1:-}"
    # [[:space:]] also covers tabs and the CR of a CRLF-terminated value,
    # which would otherwise end up inside a group name and be rejected.
    local cleaned_groups="${raw_groups//[[:space:]]/}"
    local parsed_groups=()
    local deduped_groups=()
    local group=""
    # Comma-delimited set of the groups already emitted: ",a,b,".
    local seen=","

    if [[ -z "${cleaned_groups}" ]]; then
        printf '%s\n' "all"
        return 0
    fi

    IFS=',' read -r -a parsed_groups <<< "${cleaned_groups}"
    for group in "${parsed_groups[@]}"; do
        [[ -n "${group}" ]] || continue
        case "${group}" in
            all | common | hive2_only | hive3_only) ;;
            *)
                echo "Unknown hive bootstrap group: ${group}" >&2
                return 1
                ;;
        esac

        # "all" subsumes every other group, so stop right here.
        if [[ "${group}" == "all" ]]; then
            printf '%s\n' "all"
            return 0
        fi

        if [[ "${seen}" == *",${group},"* ]]; then
            continue
        fi

        seen="${seen}${group},"
        deduped_groups+=("${group}")
    done

    if ((${#deduped_groups[@]} == 0)); then
        printf '%s\n' "all"
        return 0
    fi

    # A function-local IFS makes "${array[*]}" join with commas and is undone
    # automatically on return; a manual save/restore would turn an unset IFS
    # into an empty one for the caller.
    local IFS=','
    printf '%s\n' "${deduped_groups[*]}"
}

#######################################
# Tell whether a group is part of a normalized group selection.
#
# Arguments:
#   $1 - normalized selection: "all" or a comma-joined group list
#        (an empty or missing value is treated as "all")
#   $2 - group name to look up
# Returns:
#   0 when the selection is "all" or lists the group as a whole item,
#   1 otherwise.
#######################################
bootstrap_group_enabled() {
    local selection="${1:-all}"
    local wanted="${2}"

    case "${selection}" in
        all)
            return 0
            ;;
    esac

    # Wrapping both sides in commas restricts the match to whole items.
    case ",${selection}," in
        *",${wanted},"*)
            return 0
            ;;
        *)
            return 1
            ;;
    esac
}

#######################################
# Merge several bootstrap group lists into one normalized selection.
#
# Each argument is passed through bootstrap_normalize_groups. The result is
# the union of the requested groups, always emitted in the fixed order
# common, hive2_only, hive3_only.
#
# Arguments:
#   $@ - zero or more raw group lists
# Outputs:
#   stdout: "all" when no argument is given or any argument normalizes to
#           "all" (an empty argument does); otherwise the comma-joined union.
# Returns:
#   0 on success, 1 when bootstrap_normalize_groups rejects an argument.
#######################################
bootstrap_merge_groups() {
    local groups_input=""
    local normalized_groups=""
    local include_common=0
    local include_hive2_only=0
    local include_hive3_only=0
    local merged_groups=()

    for groups_input in "$@"; do
        normalized_groups="$(bootstrap_normalize_groups "${groups_input}")" || return 1
        # One "all" input makes the merged selection "all" as well.
        if [[ "${normalized_groups}" == "all" ]]; then
            printf '%s\n' "all"
            return 0
        fi

        if bootstrap_group_enabled "${normalized_groups}" "common"; then
            include_common=1
        fi
        if bootstrap_group_enabled "${normalized_groups}" "hive2_only"; then
            include_hive2_only=1
        fi
        if bootstrap_group_enabled "${normalized_groups}" "hive3_only"; then
            include_hive3_only=1
        fi
    done

    if ((include_common == 1)); then
        merged_groups+=("common")
    fi
    if ((include_hive2_only == 1)); then
        merged_groups+=("hive2_only")
    fi
    if ((include_hive3_only == 1)); then
        merged_groups+=("hive3_only")
    fi

    if ((${#merged_groups[@]} == 0)); then
        printf '%s\n' "all"
        return 0
    fi

    # A function-local IFS makes "${array[*]}" join with commas and is undone
    # automatically on return; a manual save/restore would turn an unset IFS
    # into an empty one for the caller.
    local IFS=','
    printf '%s\n' "${merged_groups[*]}"
}

#######################################
# Check whether a path is listed in a group's membership file.
#
# The file consulted is "${BOOTSTRAP_HELPER_DIR}/<group>.<kind>.list"; the
# path must equal one complete line of it, compared as a literal string.
#
# Globals:
#   BOOTSTRAP_HELPER_DIR (read)
# Arguments:
#   $1 - group name
#   $2 - item kind (the middle part of the list file name)
#   $3 - path to look up
# Returns:
#   0 when the path is listed; non-zero when it is not, when the path is
#   empty, or when the list file does not exist.
#######################################
bootstrap_list_contains() {
    local group="${1}"
    local kind="${2}"
    local relative_path="${3}"
    local list_path="${BOOTSTRAP_HELPER_DIR}/${group}.${kind}.list"

    # An empty pattern with -x would match any blank line in the list file.
    [[ -n "${relative_path}" ]] || return 1
    [[ -f "${list_path}" ]] || return 1
    # "--" ends option parsing so a path starting with "-" stays a pattern.
    grep -Fxq -- "${relative_path}" "${list_path}"
}

#######################################
# Resolve the bootstrap group that owns an item.
#
# The hive2_only and hive3_only lists are consulted through
# bootstrap_list_contains; an item found in neither belongs to "common".
#
# Arguments:
#   $1 - item kind
#   $2 - item path
# Outputs:
#   stdout: the owning group name
#   stderr: a message when the item appears in more than one list
# Returns:
#   0 on success, 1 when the item is mapped to multiple groups.
#######################################
bootstrap_item_group() {
    local kind="${1}"
    local relative_path="${2}"
    local owner=""
    local candidate=""

    for candidate in hive2_only hive3_only; do
        bootstrap_list_contains "${candidate}" "${kind}" "${relative_path}" || continue

        # A second hit means the lists disagree about who owns the item.
        if [[ -n "${owner}" ]]; then
            echo "Bootstrap item ${relative_path} is mapped to multiple groups" >&2
            return 1
        fi
        owner="${candidate}"
    done

    echo "${owner:-common}"
}

#######################################
# Decide whether an item is part of the active group selection.
#
# Arguments:
#   $1 - normalized selection (an empty or missing value means "all")
#   $2 - item kind
#   $3 - item path
# Returns:
#   0 when the item's owning group is enabled; 1 when bootstrap_item_group
#   fails; otherwise the status of bootstrap_group_enabled.
#######################################
bootstrap_item_selected() {
    local selection="${1:-all}"
    local kind="${2}"
    local relative_path="${3}"
    local owner=""

    if ! owner="$(bootstrap_item_group "${kind}" "${relative_path}")"; then
        return 1
    fi

    bootstrap_group_enabled "${selection}" "${owner}"
}

#######################################
# Decide whether an archive is part of the active group selection.
#
# The archive is judged by the "run.sh" path in its own directory: that path
# is looked up as a "run_sh" item through bootstrap_item_selected.
#
# Arguments:
#   $1 - normalized selection (an empty or missing value means "all")
#   $2 - archive path
# Returns:
#   The status of bootstrap_item_selected for the sibling run.sh path.
#######################################
bootstrap_archive_selected() {
    local selection="${1:-all}"
    local archive="${2}"
    local sibling_run_script

    sibling_run_script="$(dirname "${archive}")/run.sh"
    bootstrap_item_selected "${selection}" "run_sh" "${sibling_run_script}"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
create_preinstalled_scripts/run67.hql
create_preinstalled_scripts/run80.hql
create_preinstalled_scripts/run81.hql
create_preinstalled_scripts/run84.hql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
data/multi_catalog/hive_config_test/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
data/multi_catalog/logs1_parquet/data
data/multi_catalog/test_complex_types/data
data/multi_catalog/test_wide_table/data
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
data/multi_catalog/logs1_parquet/run.sh
data/multi_catalog/test_complex_types/run.sh
data/multi_catalog/test_wide_table/run.sh
118 changes: 76 additions & 42 deletions docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@

set -e -x

. /mnt/scripts/bootstrap/bootstrap-groups.sh
BOOTSTRAP_GROUPS="$(bootstrap_normalize_groups "${HIVE_BOOTSTRAP_GROUPS:-}")"
echo "Load hive data with bootstrap groups: ${BOOTSTRAP_GROUPS}"

AUX_LIB="/mnt/scripts/auxlib"
for file in "${AUX_LIB}"/*.tar.gz; do
Expand Down Expand Up @@ -73,68 +76,99 @@ fi
hadoop fs -mkdir -p /user/doris/suites/

DATA_DIR="/mnt/scripts/data/"
find "${DATA_DIR}" -type f -name "run.sh" -print0 | xargs -0 -n 1 -P "${LOAD_PARALLEL}" -I {} bash -ec '
START_TIME=$(date +%s)
bash -e "{}" || (echo "Failed to executing script: {}" && exit 1)
END_TIME=$(date +%s)
EXECUTION_TIME=$((END_TIME - START_TIME))
echo "Script: {} executed in $EXECUTION_TIME seconds"
'
# Collect every run.sh found under DATA_DIR whose bootstrap group is selected.
run_scripts=()
# find emits NUL-terminated paths; read -d '' consumes them one at a time.
while IFS= read -r -d '' run_script; do
# The group lookup is done on the path with the /mnt/scripts/ prefix removed.
relative_run_script="${run_script#/mnt/scripts/}"
if bootstrap_item_selected "${BOOTSTRAP_GROUPS}" "run_sh" "${relative_run_script}"; then
run_scripts+=("${run_script}")
fi
done < <(find "${DATA_DIR}" -type f -name "run.sh" -print0)

# Run the selected scripts with up to LOAD_PARALLEL concurrent workers.
# xargs is only invoked when at least one script was selected.
if (( ${#run_scripts[@]} > 0 )); then
printf '%s\0' "${run_scripts[@]}" | xargs -0 -P "${LOAD_PARALLEL}" -I {} bash -ec '
START_TIME=$(date +%s)
bash -e "{}" || (echo "Failed to executing script: {}" && exit 1)
END_TIME=$(date +%s)
EXECUTION_TIME=$((END_TIME - START_TIME))
echo "Script: {} executed in $EXECUTION_TIME seconds"
'
fi

# put data file
hadoop_put_pids=()
hadoop_put_paths=()
hadoop fs -mkdir -p /user/doris/

#######################################
# Start a background upload of /mnt/scripts/<path> to /user/doris/ in HDFS
# when the path's bootstrap group is selected.
#
# Globals:
#   BOOTSTRAP_GROUPS (read)
#   hadoop_put_pids  (appended: PID of the background upload)
#   hadoop_put_paths (appended: the uploaded relative path)
# Arguments:
#   $1 - path relative to /mnt/scripts
# Returns:
#   0 when the item is not selected or the upload was started. Exits the
#   shell with status 1 when the local path is missing or is an empty
#   directory.
#######################################
copy_to_hdfs_if_selected() {
    local relative_path="$1"
    local local_path="/mnt/scripts/${relative_path}"

    if ! bootstrap_item_selected "${BOOTSTRAP_GROUPS}" "hdfs_dir" "${relative_path}"; then
        return 0
    fi

    if [[ ! -e "${local_path}" ]]; then
        echo "${local_path} does not exist"
        exit 1
    fi

    # The directory exists but has no visible entries: report it as empty
    # rather than repeating the "does not exist" message used above.
    if [[ -d "${local_path}" && -z "$(ls "${local_path}")" ]]; then
        echo "${local_path} is empty"
        exit 1
    fi

    # The upload runs in the background; the PID and path are recorded so the
    # caller can wait for it and check the result afterwards.
    hadoop fs -copyFromLocal -f "${local_path}" /user/doris/ &
    hadoop_put_pids+=("$!")
    hadoop_put_paths+=("${relative_path}")
}


## put tpch1
if [[ -z "$(ls /mnt/scripts/tpch1.db)" ]]; then
echo "tpch1.db does not exist"
exit 1
fi
hadoop fs -copyFromLocal -f /mnt/scripts/tpch1.db /user/doris/ &
hadoop_put_pids+=($!)
copy_to_hdfs_if_selected "tpch1.db"

## put paimon1
hadoop fs -copyFromLocal -f /mnt/scripts/paimon1 /user/doris/ &
hadoop_put_pids+=($!)
copy_to_hdfs_if_selected "paimon1"


## put tvf_data
if [[ -z "$(ls /mnt/scripts/tvf_data)" ]]; then
echo "tvf_data does not exist"
exit 1
fi
hadoop fs -copyFromLocal -f /mnt/scripts/tvf_data /user/doris/ &
hadoop_put_pids+=($!)
copy_to_hdfs_if_selected "tvf_data"

## put other preinstalled data
hadoop fs -copyFromLocal -f /mnt/scripts/preinstalled_data /user/doris/ &
hadoop_put_pids+=($!)
copy_to_hdfs_if_selected "preinstalled_data"


# wait put finish
wait "${hadoop_put_pids[@]}"
if [[ -z "$(hadoop fs -ls /user/doris/paimon1)" ]]; then
echo "paimon1 put failed"
exit 1
fi
if [[ -z "$(hadoop fs -ls /user/doris/tpch1.db)" ]]; then
echo "tpch1.db put failed"
exit 1
fi
if [[ -z "$(hadoop fs -ls /user/doris/tvf_data)" ]]; then
echo "tvf_data put failed"
exit 1
# Only call wait when at least one background upload PID was recorded.
if (( ${#hadoop_put_pids[@]} > 0 )); then
wait "${hadoop_put_pids[@]}"
fi

# Check that every recorded path now exists under /user/doris/ in HDFS and
# abort on the first one that is missing.
for relative_path in "${hadoop_put_paths[@]}"; do
if ! hadoop fs -test -e "/user/doris/${relative_path}"; then
echo "${relative_path} put failed"
exit 1
fi
done

# create tables
ls /mnt/scripts/create_preinstalled_scripts/*.hql | xargs -n 1 -P "${LOAD_PARALLEL}" -I {} bash -ec '
START_TIME=$(date +%s)
hive -f {} || (echo "Failed to executing hql: {}" && exit 1)
END_TIME=$(date +%s)
EXECUTION_TIME=$((END_TIME - START_TIME))
echo "Script: {} executed in $EXECUTION_TIME seconds"
'
# With nullglob an unmatched *.hql pattern expands to nothing, so the loop
# body is skipped instead of seeing the literal pattern.
shopt -s nullglob
# Collect the .hql files whose bootstrap group is selected.
preinstalled_hqls=()
for hql_path in /mnt/scripts/create_preinstalled_scripts/*.hql; do
# The group lookup is done on the path with the /mnt/scripts/ prefix removed.
relative_hql_path="${hql_path#/mnt/scripts/}"
if bootstrap_item_selected "${BOOTSTRAP_GROUPS}" "preinstalled_hql" "${relative_hql_path}"; then
preinstalled_hqls+=("${hql_path}")
fi
done
# Switch nullglob back off for the rest of the script.
shopt -u nullglob

# Run the selected files through "hive -f" with up to LOAD_PARALLEL
# concurrent workers; xargs is only invoked when the selection is non-empty.
if (( ${#preinstalled_hqls[@]} > 0 )); then
printf '%s\0' "${preinstalled_hqls[@]}" | xargs -0 -P "${LOAD_PARALLEL}" -I {} bash -ec '
START_TIME=$(date +%s)
hive -f {} || (echo "Failed to executing hql: {}" && exit 1)
END_TIME=$(date +%s)
EXECUTION_TIME=$((END_TIME - START_TIME))
echo "Script: {} executed in $EXECUTION_TIME seconds"
'
fi

# create view
START_TIME=$(date +%s)
Expand Down
Loading
Loading