#!/usr/bin/env bash

## Licensed to the Apache Software Foundation (ASF) under one
## or more contributor license agreements.  See the NOTICE file
## distributed with this work for additional information
## regarding copyright ownership.  The ASF licenses this file
## to you under the Apache License, Version 2.0 (the
## "License"); you may not use this file except in compliance
## with the License.  You may obtain a copy of the License at
##
##     http://www.apache.org/licenses/LICENSE-2.0
##
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.

# The environment for this sub-script is set up by "xload-common"

# Resolves a symbolic link to the path it ultimately points at.
#
# Arguments: $1 - the path to resolve
# Outputs:   the resolved path on stdout.  A path that is not a symbolic
#            link is printed unchanged.  For a symbolic link the result is
#            always absolute: canonical where "readlink -f" is used, and
#            possibly still containing ".." segments on the BSD/macOS branch.
function resolveLink() {
  local NAME=$1
  local TARGET

  if [ -L "$NAME" ]; then
    case "$OSTYPE" in
      darwin*|bsd*)
        # BSD style readlink behaves differently to GNU readlink
        # Have to manually follow links

        # Anchor a relative starting path at the current directory so that
        # every hop below works with (and yields) an absolute path
        case "$NAME" in
          /*)
            ;;
          *)
            NAME="$PWD/$NAME"
            ;;
        esac

        while [ -L "$NAME" ]; do
          TARGET=$(readlink -- "$NAME")
          case "$TARGET" in
            /*)
              NAME=$TARGET
              ;;
            *)
              # A relative link target is relative to the directory that
              # holds the link, not to the current working directory
              NAME="$(dirname -- "$NAME")/$TARGET"
              ;;
          esac
        done
        ;;
      *)
        # Assuming standard GNU readlink with -f for
        # canonicalize
        NAME=$(readlink -f -- "$NAME")
        ;;
    esac
  fi

  # printf rather than echo so a name such as "-n" is printed verbatim
  printf '%s\n' "$NAME"
}

# JENA_HOME must be provided by the caller
if [ -z "$JENA_HOME" ]; then
  echo "JENA_HOME is not set" 1>&2
  exit 1
fi
# If JENA_HOME is a symbolic link need to resolve
if [ -L "${JENA_HOME}" ]; then
  # Remember where the link itself lives before JENA_HOME is overwritten:
  # a relative link target is relative to that directory
  JENA_HOME_LINK_DIR=$(dirname "$JENA_HOME")
  JENA_HOME=$(resolveLink "$JENA_HOME")
  # If link is relative
  case "$JENA_HOME" in
    /*)
      # Already absolute
      ;;
    *)
      # Relative, make absolute by anchoring it at the link's directory.
      # cd output is discarded because cd may print the directory when
      # CDPATH is in effect.
      JENA_HOME="$(cd "$JENA_HOME_LINK_DIR" > /dev/null && pwd)/$JENA_HOME"
      ;;
  esac
  unset JENA_HOME_LINK_DIR
  export JENA_HOME
  echo "Resolved symbolic links for JENA_HOME to $JENA_HOME"
fi

# Locate the Java launcher.  An explicit JAVA wins, then JAVA_HOME, and
# finally whatever "java" is found on the PATH.
if [ -z "$JAVA" ]
then
    if [ -z "$JAVA_HOME" ]
    then
        # command -v is the POSIX way to look up a command; "which" is an
        # external tool that is not guaranteed to be installed
        JAVA="$(command -v java)"
    else
        JAVA="$JAVA_HOME/bin/java"
    fi
fi

# Still empty means no JAVA, no JAVA_HOME and nothing on the PATH
if [ -z "$JAVA" ]
then
    (
        echo "Cannot find a Java JDK."
        echo "Please either set JAVA or JAVA_HOME, or put java (>=Java 11) in your PATH."
    ) 1>&2
    exit 1
fi

# Load the shared loader functions from LOADER_SCRIPTS.
# NOTE(review): helpers called further down (abort, debug, info, ...) are not
# defined in this file, presumably they come from xload-common - confirm.
if [ ! -e "${LOADER_SCRIPTS}/xload-common" ]; then
  echo "Unable to locate common functions script xload-common (index phase)"
  exit 1
fi
# Can source common functions
. "${LOADER_SCRIPTS}/xload-common"

# Writes the command line help for xload-index to stdout.
# Takes no arguments.  The heredoc delimiter is quoted so the help text is
# emitted literally with no shell expansion.
printUsage() {
  cat <<'EOF'
xload-index - TDB2 Bulk Loader - Index Phase

Usage: xload-index --loc <Directory> [Options]

Bulk Loader for TDB2 which generates the Index files based upon the
temporary data files generated by xload-data.  This command relies
on POSIX utilities so will only work on POSIX operating systems.

This command can only be used to create new database. If you wish to
bulk load to an existing database please use tdbloader instead.

Required options are as follows:

  -l <DatabaseDirectory>
  --loc <DatabaseDirectory>
    Sets the location in which the database should be created.

    This location must be a directory and must be empty, if a
    non-existent path is specified it will be created as a new
    directory.

Common additional options are as follows:

  -h
  --help
    Prints this help summary and exits

Advanced additional options are as follows:

  -d
  --debug
    Enable debug mode, adds extra debug output

  -j <JvmArgs>
  --jvm-args <JvmArgs>
    Sets the arguments that should be passed to the JVM for the
    JVM based portions of the build.

    Generally it is best to not change these unless you have been
    specifically advised to.  The scripts will use appropriate
    defaults if this is not specified.

    In particular be careful increasing the heap size since many
    parts of TDB2 actually use memory mapped files that live
    outside the heap so if the heap is too large the heap may
    conflict with the memory mapped files for memory space.

  -k
  --keep-work
    Keeps the temporary work files around after they are no longer
    needed.  May be useful for debugging.

  -s <SortArgs>
  --sort-args <SortArgs>
    Sets the arguments that should be passed to sort for the sort
    based portions of the build.

    Generally it is best not to change these as the scripts will
    use appropriate defaults for your system.

  -t
  --trace
    Enable trace mode, essentially sets -x within the scripts
EOF
}

# From here on any failing command aborts the script.
set -e

# Use the C locale so that sort order is ASCII
export LC_ALL="C"

# Defaults for the settings the argument loop may override
DEBUG=0
KEEP_WORK=0
LOC=
JVM_ARGS=
# SORT_ARGS may be inherited from the environment, otherwise it starts empty
SORT_ARGS="${SORT_ARGS:-}"
[ -z "$SORT_ARGS" ] || echo "Using SORT_ARGS: $SORT_ARGS"

# Walk the command line one option at a time.  Options that take a value
# consume two positional parameters, flags consume one.
while [ $# -gt 0 ]
do
  ARG=$1

  # An option that needs a value must not be the last argument.  Without
  # this check the second shift below fails and "set -e" ends the script
  # with no explanation.
  # NOTE(review): assumes abort (defined outside this file) terminates the
  # script - confirm.
  case "$ARG" in
    -j|--jvm-args|-l|--loc|-loc|-s|--sort-args)
      if [ $# -lt 2 ]; then
        abort 1 "Option $ARG requires a value"
      fi
      ;;
  esac

  case "$ARG" in
    -d|--debug)
      # Debug Mode
      shift
      DEBUG=1
      ;;
    -h|--help)
      printUsage
      exit 0
      ;;
    -j|--jvm-args)
      # JVM Arguments
      JVM_ARGS="$2"
      shift 2
      ;;
    -k|--keep-work)
      # Keep work files
      shift
      KEEP_WORK=1
      ;;
    -l|--loc|-loc)
      # Location space separated
      LOC="$2"
      shift 2
      ;;
    -*loc=*)
      # Location = separated
      # Strip only the shortest leading "-...loc=" so that a path which
      # itself contains "loc=" is kept intact
      LOC=${ARG#-*loc=}
      shift
      ;;
    -s|--sort-args)
      # Sort arguments
      SORT_ARGS=$2
      shift 2
      ;;
    -t|--trace)
      # Trace mode
      shift
      set -x
      ;;
    *)
      # Additional options are not supported
      abort 1 "Unrecognized option $ARG"
      ;;
  esac
done

# A database location is mandatory
[ -n "$LOC" ] || abort 1 "Required database location not specified"

# Switch LOC to its absolute form, reporting it only when that changes it
ABS_LOC=$(makeAbsolute "$LOC")
if [ "$LOC" != "$ABS_LOC" ]; then
  LOC="$ABS_LOC"
  debug "Absolute database location is $LOC"
fi

# The location has to exist already and has to be a directory
[ -e "$LOC" ] || abort 1 "Database location specified does not exist: $LOC"
[ -d "$LOC" ] || abort 1 "Database location is not a directory: $LOC"

# Both intermediate text files must be present in the location
DATA_TRIPLES="$LOC/data-triples.tmp"
DATA_QUADS="$LOC/data-quads.tmp"

[ -e "$DATA_TRIPLES" ] || abort 1 "No triples text file found in database location, please run the xload-data script first"
[ -e "$DATA_QUADS" ] || abort 1 "No quads text file found in database location, please run the xload-data script first"

debug "Data text files are $DATA_TRIPLES and $DATA_QUADS"

debug "Data text files are $DATA_TRIPLES and $DATA_QUADS"

# Choose default sort arguments when none were supplied
if [ -z "$SORT_ARGS" ]; then
    SORT_ARGS="--buffer-size=66%"
    # --parallel is not always available, so probe for it on empty input.
    # Running the probe as the "if" condition keeps a failure from
    # triggering exit on error.
    if sort --parallel=3 < /dev/null 2>/dev/null; then
        SORT_ARGS="$SORT_ARGS --parallel=3"
    fi
fi

# Fall back to a default heap setting when no JVM arguments were given
: "${JVM_ARGS:=-Xmx1200M}"
debug "JVM Arguments are $JVM_ARGS"

# Classpath set in "tdb2.xloader"
[ -n "$JENA_CP" ] || abort 1 "Classpath not provided : set JENA_CP"
debug "Jena Classpath is $JENA_CP"

# ---- Index intermediates
# All files are written S P O / G S P O columns per row but in different sort orders.
info "Index Building Phase"

# Prints the temporary directory named in a sort argument string.
#
# Arguments: $1 - the sort arguments as a single string
# Outputs:   the word following "-T " or "--temporary-directory=" on stdout,
#            wherever it appears in the string; nothing when neither option
#            is present.
function sortTempDirFromArgs() {
  local ARGS="$1"
  local REST=
  local DIR=

  # Drop everything up to and including the option itself
  case "$ARGS" in
    *"-T "*)
      REST=${ARGS#*-T }
      ;;
    *"--temporary-directory="*)
      REST=${ARGS#*--temporary-directory=}
      ;;
    *)
      return 0
      ;;
  esac

  # The directory is the first whitespace separated word that remains
  read -r DIR _ <<< "$REST" || true
  printf '%s\n' "$DIR"
}

# Check where we are storing temporary sort files
debug "Sort Arguments: $SORT_ARGS"
SORT_TEMP_DIR=$(sortTempDirFromArgs "$SORT_ARGS")
if [ -z "$SORT_TEMP_DIR" ]; then
  # Using the system temp directory
  SORT_TEMP_DIR="$TMPDIR"
fi
if [ -n "$SORT_TEMP_DIR" ]; then
  # If we've figured out the sort temporary directory then check it
  SORT_TEMP_DIR=$(makeAbsolute "$SORT_TEMP_DIR")
  debug "Sort Temp Directory: $SORT_TEMP_DIR"
  # As used below: element 0 is the disk, element 2 the percentage of free
  # space and element 3 the free space in bytes
  SORT_DRIVE_INFO=($(getDriveInfo "${SORT_TEMP_DIR}"))
  if [ "${#SORT_DRIVE_INFO[@]}" -gt 0 ]; then
    debug "Sort Temp Directory is on disk ${SORT_DRIVE_INFO[0]} which has ${SORT_DRIVE_INFO[2]}% free space (${SORT_DRIVE_INFO[3]} bytes)"

    # Warn, but carry on, when 10% or less of the disk is free
    if [ "${SORT_DRIVE_INFO[2]}" -le 10 ]; then
      warn "-----"
      warn "Sort Temp Directory ${SORT_TEMP_DIR} is on disk ${SORT_DRIVE_INFO[0]} which only has ${SORT_DRIVE_INFO[2]}% free space (${SORT_DRIVE_INFO[3]} bytes) available"
      warn "This may result in sort failures if the data to be indexed is large"
      warn "-----"
    fi
  fi
fi

# Sorts one intermediate data file into the order required by an index and
# then builds that index.
#
# Arguments: $1 - the sort key options, e.g. "-k 1,1 -k 2,2 -k 3,3"
#            $2 - the data file to sort
#            $3 - the index name, e.g. SPO
# Globals:   reads LOC, SORT_ARGS, SORT_TEMP_DIR, JAVA, JVM_ARGS, JENA_CP,
#            PKG and KEEP_WORK
# Outputs:   creates the work file $LOC/<index>-txt, which is removed again
#            unless KEEP_WORK is set
# Returns:   immediately, without doing any work, when the data file is
#            missing or empty
generate_index()
{
    local KEYS="$1"
    local DATA="$2"
    local IDX=$3
    local WORK="$LOC/$IDX-txt"
    # Kept local so that a call does not leak or clobber script level state
    local FREE_MEM
    local -a SORT_DRIVE_INFO

    if [ ! -s "$DATA" ]; then
      debug "Skipping Index $IDX as no relevant data to index"
      return
    fi

    info "Creating Index $IDX"

    # For various purposes we need to know the size of the input data
    local SIZE=$(getSize "$DATA")
    debug "Size of data to be sorted is $SIZE bytes"

    # Verify that we have enough space to sort the data

    # Firstly check that the output disk has sufficient space
    # As used below: element 0 is the disk and element 3 the free bytes
    local WORK_DRIVE_INFO=($(getDriveInfo "$LOC"))
    if [ "${#WORK_DRIVE_INFO[@]}" -gt 0 ]; then
      if [ "${SIZE}" -ge "${WORK_DRIVE_INFO[3]}" ]; then
        # If there is insufficient disk space then we can abort now
        abort 1 "Insufficient free space on database drive ${WORK_DRIVE_INFO[0]}, there are ${WORK_DRIVE_INFO[3]} bytes free but ${SIZE} bytes are required"
      else
        debug "Sufficient free space on database drive ${WORK_DRIVE_INFO[0]} to attempt sorting data file ${DATA} (${SIZE} bytes required from ${WORK_DRIVE_INFO[3]} bytes free)"
      fi
    fi

    # Secondly check if there is enough space to sort in-memory or if sort may need to do an external sort
    # We only issue warnings when the sort is likely to be external because there are various factors
    # such as virtual memory and OS file caching that may complicate this
    FREE_MEM=$(getFreeMem)
    if [ "$FREE_MEM" -ge 0 ]; then
      if [ "$SIZE" -ge "$FREE_MEM" ]; then
        debug "Insufficient free memory to sort data in-memory, sort will need to perform an external sort using Temp Directory ${SORT_TEMP_DIR}"

        # Check for disk space on temporary disk
        if [ -n "${SORT_TEMP_DIR}" ]; then
          SORT_DRIVE_INFO=($(getDriveInfo "${SORT_TEMP_DIR}"))
          if [ "${#SORT_DRIVE_INFO[@]}" -gt 0 ]; then
            if [ "$SIZE" -ge "${SORT_DRIVE_INFO[3]}" ]; then
              warn "There may be insufficient disk space for sort to perform an external sort using Temp Directory ${SORT_TEMP_DIR} (${SIZE} bytes required but only ${SORT_DRIVE_INFO[3]} bytes free)"
            fi
          fi
        fi
      else
        debug "Should be sufficient free memory ($FREE_MEM bytes) for sort to be fully in-memory"
      fi
    else
      debug "Unable to determine free memory on your OS, can't check whether sort will be in-memory or external sort using Temp Directory ${SORT_TEMP_DIR}"
    fi

    # Sort the input data
    info "Sort $IDX"
    debug "Sorting $DATA into work file $WORK"
    # SORT_ARGS and KEYS are deliberately unquoted: each holds several
    # options that must be split into separate words
    sort $SORT_ARGS -u $KEYS < "$DATA" > "$WORK"
    info "Sort $IDX Completed"

    # Build into an index
    info "Build $IDX"
    # Remove the files of any earlier build of this index first
    rm -f "$LOC/$IDX.dat"
    rm -f "$LOC/$IDX.idn"
    # JVM_ARGS is deliberately unquoted so that it splits into separate options
    "$JAVA" $JVM_ARGS -cp "$JENA_CP" "$PKG".CmdIndexBuild "$LOC" "$IDX" "$WORK"
    info "Build $IDX Completed"

    # Remove work file unless keeping
    if [ "$KEEP_WORK" = 0 ]; then
      debug "Cleaning up work file $WORK"
      rm "$WORK"
    fi
}

# Sort key options, one per column of the data files
K1="-k 1,1"
K2="-k 2,2"
K3="-k 3,3"
K4="-k 4,4"

# Each entry is "<index name>|<sort keys>", the keys listing the columns in
# the order given by the index name.

# Triple indexes: the columns of the triples file are S P O
for SPEC in \
  "SPO|$K1 $K2 $K3" \
  "POS|$K2 $K3 $K1" \
  "OSP|$K3 $K1 $K2"
do
  generate_index "${SPEC#*|}" "$DATA_TRIPLES" "${SPEC%%|*}"
done

# Quad indexes: the columns of the quads file are G S P O
for SPEC in \
  "GSPO|$K1 $K2 $K3 $K4" \
  "GPOS|$K1 $K3 $K4 $K2" \
  "GOSP|$K1 $K4 $K2 $K3" \
  "SPOG|$K2 $K3 $K4 $K1" \
  "POSG|$K3 $K4 $K2 $K1" \
  "OSPG|$K4 $K2 $K3 $K1"
do
  generate_index "${SPEC#*|}" "$DATA_QUADS" "${SPEC%%|*}"
done

info "Index Building Phase Completed"

# ---- Clean up.
# The intermediate data files are deleted unless work files are being kept.
case "$KEEP_WORK" in
  0)
    debug "Cleaning up data files $DATA_TRIPLES and $DATA_QUADS"
    rm -f "$DATA_TRIPLES" "$DATA_QUADS"
    ;;
esac
