#!/usr/bin/env Rscript

## A script to read in functional annotation data and perform the
## functional enrichment analysis

## Amy Willis, March 2020

## Usage:
#         anvi-get-enriched-functions-per-pan-group --input input.txt --output output.txt
#
# This script is primarily used by the program `anvi-get-enriched-functions-per-pan-group`. But the input file format
# is expected to be like this:
#
#         COG_FUNCTION                                                                            function_accession  gene_clusters_ids                        associated_groups  p_HL  p_LL    N_HL  N_LL
#         NADPH-dependent glutamate synthase beta chain or related oxidoreductase                 COG0493             GC 00003904                              LL                 0     0.1818  20    11
#         Urease beta subunit                                                                     COG0832             GC 00000921                              HL                 0.95  0.4545  20    11
#         Phosphoribosyl-AMP cyclohydrolase                                                       COG0139             GC 00000450                                                 1     1       20    11
#         Predicted phage phi-C31 gp36 major capsid-like protein                                  COG4653             GC 00005619                              HL                 0.05  0       20    11
#         Cytochrome c biogenesis protein CcdA                                                    COG0785             GC 00000956                              HL                 1     0.2727  20    11
#         FoF1-type ATP synthase, epsilon subunit                                                 COG0355             GC 00000685                                                 1     1       20    11
#         Predicted mannosyl-3-phosphoglycerate phosphatase, HAD superfamily                      COG3769             GC 00001447,GC 00002275,GC 00003041      HL                 1     0.8182  20    11
#         Predicted flavoprotein YhiN                                                             COG2081             GC 00000967,GC 00002155                  HL                 1     0.9091  20    11
#         Phosphatidylserine/phosphatidylglycerophosphate/cardiolipin synthase or related enzyme  COG1502             GC 00003553                              LL                 0     0.0909  20    11
#         (...)                                                                                   (...)               (...)                                    (...)             (...)  (...)   (...) (...)
#
# Running this script on this file will add the following columns into it: enrichment_score, unadjusted_p_value, adjusted_q_value, associated_groups
#

## Example:
#
#         anvi-get-enriched-functions-per-pan-group --input input_TAB_delimited_file.txt --output output_tab_delimited_file.txt
#
## TODOs:
# how do anvio users want to do the following steps:
# installing BiocManager and then running `BiocManager::install("qvalue")`

## Load required packages
suppressPackageStartupMessages(library(optparse))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(magrittr))

## If we got here, that means tidyverse was loaded, which means it
## must be installed.  This means tidyr must also be installed.  This
## is a sanity check for that.
if (isFALSE("tidyr" %in% rownames(installed.packages())) ||
    isFALSE("tidyr" %in% (.packages()))) {
  print("Error: tidyr not loaded.  Something went wrong")
  quit(save = "no", status = 1, runLast = FALSE)
}

## Figure out which version of tidyr is installed.
tidyr_version <- installed.packages()[(.packages()), ]["tidyr", "Version"]

## tidyr uses semantic versioning ie 1.0.0 and then possibly some
## stuff after.
matches <- regmatches(tidyr_version,
                      regexec("([0-9]+)\\.([0-9]+)\\.([0-9]+).*",
                              tidyr_version,
                              perl = TRUE))

## See if there were matches.
if (length(matches[[1]]) == 0) {
  warning("Could not determine tidyr version number accurately")
} else {
  ## Major version is the first matching group.
  ver_major <- matches[[1]][[2]]

  if (strtoi(ver_major) >= 1) {
    message("tidyr major version >= 1.  Using nest_legacy.")

    ## Need to use nest_legacy if using tidyr version >= 1.
    nest <- nest_legacy
  }
}

# command line options
option_list <- list(
  make_option(c("--output"),
              help = "Output file name"),
  make_option(c("--input"),
              help = "Input file name")
)
parser <- OptionParser(option_list = option_list,
                       description = "Hypothesis testing for 'functional enrichment' claims.")
arguments <- parse_args(parser,
                        positional_arguments = TRUE)
opts <- arguments$options

# check if the input file is accessible
if (file.access(opts$input) == -1) {
  stop(sprintf("Specified input file '%s' does not exist", opts$input))
}

# read in data
df_in <- readr::read_tsv(file = opts$input)

# Find number of groups
# Be smart about finding number of header rows
n_columns_before_data <- (names(df_in) %>% startsWith("p_") %>% which %>% min) - 1
G <- (ncol(df_in) - n_columns_before_data) / 2
# check if the input file makes sense
if (G %% 1 != 0) {
  stop(sprintf("The input file '%s' does not look like it came from anvi-get-enriched-functions-per-pan-group.", opt$input))
}

# Set up functions to run
run_test_no_spread <- function(df) {
  df %>%
    glm(cbind(x, N - x) ~ group, .,
        family = binomial(link = "logit")) %>%
    anova(test="Rao")
}
get_enrichment_score <- function(anova_output) {
  anova_output$Rao[2]
}
get_unadjusted_p_value <- function(anova_output) {
  ifelse(!is.na(anova_output$`Pr(>Chi)`[2]),
         anova_output$`Pr(>Chi)`[2],
         ifelse(anova_output$Rao[2] < 1e-3,
                1,
                stop("glm gave you a not-small test statistic but a missing p-value. This shouldn't happen! But it has, please submit an issue on our GitHub and tag @adw96 :)")))
}

# fit the GLMs in a nifty way using nest()
w_models <- df_in %>%
  gather(key = "type", value = "value", -c(1:n_columns_before_data)) %>%
  separate(type, into = c("type", "group"), sep = "_", extra = "merge") %>%
  spread(type, value) %>%
  mutate(x = N * p) %>%
  select(-p) %>%
  group_by(function_accession) %>%
  nest %>%
  mutate(model = map(data, run_test_no_spread))

# get the p-values & enrichment_scores
pvalues_df <- w_models %>%
  transmute(function_accession,
            "unadjusted_p_value" = map_dbl(model, get_unadjusted_p_value),
            "enrichment_score" = map_dbl(model, get_enrichment_score))

# find the maximum lambda for qvalue
# reasons for this change documented at https://github.com/merenlab/anvio/issues/1383
max_lambda <- min(0.95, max(pvalues_df$unadjusted_p_value, na.rm=T))

# get the q-values, then format the data and save
enrichment_output <- pvalues_df %>%
  mutate("adjusted_q_value" = qvalue::qvalue(unadjusted_p_value,
                                             lambda = seq(0.05, max_lambda, 0.05))$qvalues) %>%
  left_join(df_in, by = "function_accession") %>%
  select(names(df_in)[1],  # e.g. COG_FUNCTION
         enrichment_score,
         unadjusted_p_value,
         adjusted_q_value,
         associated_groups,
         function_accession,
         gene_clusters_ids,
         names(df_in)[n_columns_before_data + (1 : (G * 2))])

# Rao test statistic should always be positive. However,
# it happens that Rao test statistics are numerically negative
# but close to zero. If they are large and negative (say, smaller than
# 0.00001), we should throw an error:
if (min(enrichment_output$enrichment_score) < -1e-5) {
  message("A Rao test statistic is large and negative, oh my!")
  message("Here are the rows where the test statistic was negative:")

  enrichment_output %>%
    filter(enrichment_score < 0) %>%
    arrange(desc(enrichment_score)) %>%
    print(n = Inf)

  stop("Something has gone terribly wrong, and it's Amy's fault. Please let her know ASAP so she can fix it!")
}

# If they are small and negative, just set them to zero. Then,
# order the columns from largest to smallest enrichment score:
enrichment_output %<>%
  mutate("enrichment_score" = pmax(0, enrichment_score)) %>%
  arrange(desc(enrichment_score))

enrichment_output %>%
  write_tsv(path = opts$output)
