#!/usr/bin/env bash

# Wall-clock start of this run, in seconds since the epoch
start_time="$(date +%s)"

# Script version reported by -v
VERSION="0.4.2"

# Input/output locations (empty until supplied on the command line)
RAW_SEQ_DIR=             # Directory containing raw sequence files
OUTPUT_DIR=              # Output directory for the results
DATABASE=                # Path to the database required for analysis, to be specified by the user
LOG_FILE=                # Path of the pipeline log file

# Analysis settings
SAMPLETYPE=              # Sample type: DNA, RNA, or Mix. Default is empty.
ASSEMBLY_SOFTWARE=       # Assembly software to be used, e.g., "megahit" or "metaspades"
CONCENTRATION_TYPE=      # Concentration type: non-concentration or concentration. Default is empty.
REASSEMBLE=false         # Whether reassembly was requested
THREADS=0                # Thread count; starts at 0

# Guard flag: cleanup() does its work only while this is still false
cleanup_flag=false

# Route SIGINT (Ctrl+C) and SIGTERM to cleanup()
trap cleanup INT TERM

# Stop the run: restore the terminal cursor, signal the whole process group
# when this shell is the group leader, and exit with status 1.
# Globals: cleanup_flag (read and set) so that a second call returns at once.
cleanup() {
  # A second invocation does nothing
  [ "$cleanup_flag" = true ] && return
  cleanup_flag=true

  # Make the cursor visible again when stdout is a terminal
  if [ -t 1 ]; then
    tput cnorm
  fi

  # Only signal the group when this shell's PID is also its process-group ID
  local group_id
  group_id="$(ps -o pgid= $$)"
  if [ "$$" -eq "$group_id" ]; then
    log_with_timestamp "Caught termination signal. Cleaning up..."
    kill -- -$$
  fi

  exit 1
}


# Print a message on stdout, prefixed with the current local date and time.
# Arguments: $1 - message text
log_with_timestamp() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '%s - %s\n' "$stamp" "$1"
}

# Check that a parameter value is exactly one of the allowed choices.
# Arguments:
#   $1 - value to validate
#   $2 - allowed values as one whitespace-separated string
# Outputs: an error message on stderr when the value is not allowed
# Returns: 0 when the value is allowed; otherwise exits the shell with status 1
validate_param() {
  local param="$1"
  local valid_values="$2"
  local candidate
  local -a choices

  # Split the list into words and compare each one exactly; a substring test
  # would also accept values such as "megahit metaspades" that merely occur
  # inside the list.
  read -r -a choices <<< "$valid_values"
  for candidate in "${choices[@]}"; do
    if [[ "$param" == "$candidate" ]]; then
      return 0
    fi
  done

  echo "Error: Invalid ${param}. Please choose from: ${valid_values}." >&2
  exit 1
}

# Command-line parsing: a manual loop over the positional parameters that
# handles both the short and the long options.

# Abort unless the option being parsed is followed by a value.
# Arguments:
#   $1 - option name (used in the error message)
#   $2 - number of positional parameters still unparsed, the option included
require_option_value() {
  if [ "$2" -lt 2 ]; then
    echo "Error: Option $1 requires a value." >&2
    exit 1
  fi
}

# Print the help text on stdout.
print_usage() {
  echo "Usage: $0 [options]"
  echo ""
  echo "Options:"
  echo "  -r <input_path_raw_seqs>    Specify the input directory to search for raw seqs files."
  echo "  -o <output_path>            Specify the output directory for the results."
  echo "  -d <database_path>          Specify the path to the database required for analysis."
  echo "  -a <assembly_software>      Specify the assembly software: megahit, metaspades."
  echo "  -n <threads>                Specify the number of threads to use (default: Use max available cores)."
  echo "  --non-con                   Specify non-concentration processing."
  echo "  --con                       Specify concentration processing."
  echo "  --reassemble                Enable reassembly of bins."
  echo "  -v                          Display the version of this script."
  echo "  -h                          Display this help and exit."
}

# Store the given options in the script's global settings.
# Globals written: RAW_SEQ_DIR, OUTPUT_DIR, LOG_FILE, DATABASE,
#                  ASSEMBLY_SOFTWARE, THREADS, CONCENTRATION_TYPE, REASSEMBLE
# Arguments: the command-line arguments of the script
# Exits: 0 after -v or -h; 1 on an unknown option or a missing option value
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -r)
        require_option_value "$1" "$#"
        RAW_SEQ_DIR=$2
        shift 2
        ;;
      -o)
        require_option_value "$1" "$#"
        OUTPUT_DIR=$2
        LOG_FILE="${OUTPUT_DIR}/pipeline.log"
        # Create the log file (and its directory) the first time it is needed
        if [ ! -f "$LOG_FILE" ]; then
          mkdir -p "$(dirname "$LOG_FILE")"
          touch "$LOG_FILE"
        fi
        shift 2
        ;;
      -d)
        require_option_value "$1" "$#"
        DATABASE=$2
        shift 2
        ;;
      -a)
        require_option_value "$1" "$#"
        ASSEMBLY_SOFTWARE=$2
        shift 2
        ;;
      -n)
        require_option_value "$1" "$#"
        THREADS=$2
        shift 2
        ;;
      -v)
        echo "Version: $VERSION"
        exit 0
        ;;
      -h)
        print_usage
        exit 0
        ;;
      --non-con)
        CONCENTRATION_TYPE="non-concentration"
        shift
        ;;
      --con)
        CONCENTRATION_TYPE="concentration"
        shift
        ;;
      --reassemble)
        REASSEMBLE=true
        shift
        ;;
      # The catch-all must stay last: case runs the first matching arm only
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done
}

parse_args "$@"
# Parsing consumes the command line: leave no positional parameters behind,
# so code after this point sees an empty "$@".
set --

# Resolve the thread count. The value must be a non-negative integer; 0 or an
# empty value means "use the number of processing units reported by nproc".
if [[ ! "${THREADS:-0}" =~ ^[0-9]+$ ]]; then
  echo "Error: Invalid thread count '${THREADS}'. Please provide a non-negative integer." >&2
  exit 1
fi
# 10# forces base 10 so that values with leading zeros are not read as octal
if (( 10#${THREADS:-0} == 0 )); then
  THREADS=$(nproc)
fi

# All of these settings are mandatory: show the usage line and stop if any is empty
if [[ -z "$RAW_SEQ_DIR" || -z "$OUTPUT_DIR" || -z "$DATABASE" || -z "$CONCENTRATION_TYPE" || -z "$ASSEMBLY_SOFTWARE" ]]; then
    echo "Usage: $0 -r <input_path_raw_seqs> -o <output_path> -d <database_path> -a <assembly_software> -n <threads> --non-con/--con [--reassemble]" >&2
    exit 1
fi

# Validate assembly software option
validate_param "$ASSEMBLY_SOFTWARE" "megahit metaspades"

# Validate concentration type
validate_param "$CONCENTRATION_TYPE" "non-concentration concentration"

# Log the command with a timestamp. "$*" keeps the whole command line in one
# word; with "$@" every argument after the first would become a separate
# argument, and log_with_timestamp prints only its first one.
# NOTE(review): option parsing has consumed the positional parameters by this
# point, so the line may show only the script path — capture the arguments
# before parsing if the full command line is wanted here.
log_with_timestamp "Run ViOTUcluster as command: $0 $*"

# Log the header of the environment section (no variables are printed after it here)
log_with_timestamp "Environment variables at start:"

# Run one pipeline module while showing a spinner, writing every output line
# with a timestamp to both the terminal and the module's log file.
# Arguments:
#   $1 - module name used in the progress messages
#   $2 - log file that receives the timestamped output (its directory is created)
#   $3 - command line to run, given as one string; it is executed with eval,
#        so any arguments inside it must already be shell-quoted
# Returns: 0 when the command succeeds. When it fails, cleanup is called,
#          which exits the script with status 1.
run_module() {
  local module_name="$1"
  local log_file="$2"
  local command="$3"
  # Declared up front: a `local` placed between the pipeline and the
  # PIPESTATUS read below would overwrite PIPESTATUS.
  local result line timestamped_line

  mkdir -p "$(dirname "$log_file")"

  echo "Starting $module_name..."
  log_with_timestamp "Starting $module_name..."

  local spin='-\|/'
  local i=0
  # Hide the cursor while the spinner is drawn (terminal output only)
  [ -t 1 ] && tput civis

  # Background spinner; it is killed once the command has finished
  (
    while true; do
      i=$(( (i+1) % 4 ))
      [ -t 1 ] && printf "\rRunning %s... %s" "$module_name" "${spin:$i:1}"
      sleep 0.1
    done
  ) &

  local spinner_pid=$!

  local module_start_time
  module_start_time=$(date +%s)

  # Run the command and read its combined stdout/stderr line by line, adding a
  # timestamp to each line and sending it to the terminal and the log file.
  # `|| [[ -n "$line" ]]` keeps a final line that has no trailing newline.
  eval "stdbuf -oL $command" 2>&1 | while IFS= read -r line || [[ -n "$line" ]]; do
      timestamped_line="$(date '+%Y-%m-%d %H:%M:%S') - $line"
      echo "$timestamped_line"                   # to the terminal
      echo "$timestamped_line" >> "$log_file"      # and to the log file
  done
  # Exit status of the command itself, not of the read loop
  result=${PIPESTATUS[0]}

  # Stop the spinner, then restore the cursor and clear the spinner line
  kill "$spinner_pid" 2>/dev/null
  wait "$spinner_pid" 2>/dev/null
  [ -t 1 ] && tput cnorm
  [ -t 1 ] && printf "\r\033[K"

  if [ "$result" -ne 0 ]; then
    echo "$module_name failed. Check log: $log_file"
    log_with_timestamp "Error: $module_name failed. Check log: $log_file"
    cleanup
  fi

  local module_end_time
  module_end_time=$(date +%s)
  local module_runtime=$((module_end_time - module_start_time))

  echo "$module_name completed in ${module_runtime} seconds."
  log_with_timestamp "$module_name completed in ${module_runtime} seconds."
}

# Check that every *_R1.* file in RAW_SEQ_DIR belongs to a complete read pair.
# For each sample the mate spellings (_R1/_R2, then _1/_2) are tried with the
# extensions .fq, .fastq, .fq.gz and .fastq.gz; the first complete pair found
# is logged. The script exits with status 1 when a sample has no complete
# pair or when the directory holds no *_R1.* file at all.
# NOTE(review): the loop only visits *_R1.* files, so the _1/_2 spellings are
# only checked for a sample that also has an _R1 file — confirm whether
# inputs named *_1.* are meant to be discovered as well.
for FILE in "${RAW_SEQ_DIR}"/*_R1.*; do
  # A glob without matches is left as the literal pattern; report that case
  # directly instead of complaining about a sample called "*".
  if [ ! -e "$FILE" ] && [ ! -L "$FILE" ]; then
    log_with_timestamp "Error: No *_R1.* sequence files found in ${RAW_SEQ_DIR}"
    exit 1
  fi

  # Sample name: the file name up to the first "_R1."
  BASENAME=$(basename "$FILE")
  BASENAME=${BASENAME%%_R1.*}
  PREFIX="${RAW_SEQ_DIR}/${BASENAME}"

  pair_found=false
  for mates in "_R1 _R2" "_1 _2"; do
    first_mate=${mates% *}
    second_mate=${mates#* }
    for ext in fq fastq fq.gz fastq.gz; do
      if [ -f "${PREFIX}${first_mate}.${ext}" ] && [ -f "${PREFIX}${second_mate}.${ext}" ]; then
        log_with_timestamp "Found paired files: ${PREFIX}${first_mate}.${ext} and ${PREFIX}${second_mate}.${ext}"
        pair_found=true
        break 2
      fi
    done
  done

  if [ "$pair_found" = false ]; then
    log_with_timestamp "Error: Paired-end files for ${BASENAME} not found in the expected formats (.fq, .fastq, .fq.gz, .fastq.gz)"
    exit 1
  fi
done

# Announce the processing mode and the thread count
log_with_timestamp "Processing with $CONCENTRATION_TYPE mode and $THREADS threads."

# ScriptDir is the bin folder of the active Conda environment; stop when no
# environment is active (CONDA_PREFIX empty or unset)
if [[ -n "$CONDA_PREFIX" ]]; then
  ScriptDir="${CONDA_PREFIX}/bin"
else
  log_with_timestamp "Conda environment is not activated."
  exit 1
fi

# Export the run parameters as environment variables
export RAW_SEQ_DIR OUTPUT_DIR DATABASE
export SAMPLETYPE REASSEMBLE ASSEMBLY_SOFTWARE
export ScriptDir

# Directory that holds the per-module log files
mkdir -p "${OUTPUT_DIR}/Log"

# Run the preprocessing module. run_module executes its command string with
# eval, so each argument is escaped with %q: paths containing spaces, quotes
# or other shell metacharacters reach Preprocess_module.sh unchanged.
printf -v preprocess_command 'Preprocess_module.sh %q %q %q %q' \
  "${RAW_SEQ_DIR}" "${ASSEMBLY_SOFTWARE}" "${OUTPUT_DIR}" "${THREADS}"
run_module "Raw sequences Process" "${OUTPUT_DIR}/Log/Preprocess.log" "$preprocess_command"

# Point the following stage at directories inside the output folder
CONTIGS_DIR="${OUTPUT_DIR}/Contigs"
RAW_SEQ_DIR="${OUTPUT_DIR}/Cleanreads"
INPUT_DIR=${CONTIGS_DIR}

# Export parameters as environment variables
export INPUT_DIR OUTPUT_DIR DATABASE SAMPLETYPE REASSEMBLE ScriptDir RAW_SEQ_DIR THREADS

# Hand over to ViOTUcluster with the flag that matches CONCENTRATION_TYPE.
# This stays the last command, so its exit status is the script's exit status.
if [ "$CONCENTRATION_TYPE" == "non-concentration" ]; then
  concentration_flag="--non-con"
else
  concentration_flag="--con"
fi
ViOTUcluster -i "${INPUT_DIR}" -r "${RAW_SEQ_DIR}" -o "${OUTPUT_DIR}" -d "${DATABASE}" -n "${THREADS}" "${concentration_flag}"
