#!/usr/bin/env python3
"""Scans for motifs given importance scores.

This program scans over the contribution scores you calculated with
:py:mod:`interpretFlat<bpreveal.interpretFlat>` and looks for matches to motifs
called by modiscolite. It can be run with a quantile JSON from
:py:mod:`motifSeqletCutoffs<bpreveal.motifSeqletCutoffs>`, or you can include
the settings for that program inside the JSON for this one, in which case it
will perform the seqlet analysis first and save those results for you. If you
include a ``seqlet-cutoff-settings`` block in the config, it will run the
:py:mod:`motifSeqletCutoffs<bpreveal.motifSeqletCutoffs>` tools, and if you
don't include that, you must include a ``seqlet-cutoff-json`` file with the
appropriate cutoffs. It is an error to specify both ``seqlet-cutoff-settings``
and ``seqlet-cutoff-json``.

Configuration Json
------------------

BNF
^^^


.. highlight:: none

.. literalinclude:: ../../doc/bnf/motifScan.bnf


Parameter Notes
^^^^^^^^^^^^^^^

scan-contrib-h5
    The output of :py:mod:`interpretFlat<bpreveal.interpretFlat>` and contains
    contribution scores. All of the regions in this file will be scanned.

num-threads
    The number of parallel workers to use.
    Due to the streaming architecture of this program, the minimum value of
    ``num-threads`` is 3. I have found that this program scales very well up to
    70 cores, and haven't tested it beyond that.

hits-tsv
    Where would you like the hit data stored?

seqlet-cutoff-settings
    See :py:mod:`motifSeqletCutoffs<bpreveal.motifSeqletCutoffs>` for the
    specification of this block.

Quantile json
-------------

If you don't run the seqlet cutoffs during the scan, you need to provide a JSON
file containing the information for each pattern. This file is generated by
:py:mod:`motifSeqletCutoffs<bpreveal.motifSeqletCutoffs>` and saved to the name
``quantile-json`` in the configuration to that script.

BNF
^^^


.. highlight:: none

.. literalinclude:: ../../doc/bnf/seqletQuantileCutoffs.bnf


Parameter notes
^^^^^^^^^^^^^^^

In the quantile JSON, we find the actual numerical cutoffs for scanning.

metacluster-name, pattern-name
    These are from the modisco hdf5 file.
short-name
    is a convenient name for this motif, and is
    entirely up to you.
    The short name will be used to populate the name column in the generated
    bed and csv files.
cwm
    An array of shape (length, NUM_BASES) that contains the cwm of
    the motif.
    It is used to calculate the Jaccard similarity and the L1 score.
pssm
    The sequence-based information content at each
    position, and is used to calculate sequence match scores.

seq-match-cutoff, contrib-match-cutoff, contrib-magnitude-cutoff
    The three cutoff values are the actual scores, *not quantile values*. These
    are calculated by
    :py:mod:`motifSeqletCutoffs<bpreveal.motifSeqletCutoffs>`. You could set
    these manually, but why would you?

    seq-match: Cutoff where a sequence must have a PSSM score higher than the
    original TF-MoDISco pattern seqlets' quantile value.

    contrib-match: Cutoff where a sequence must have a CWM score higher than the
    original TF-MoDISco pattern seqlets' quantile value.

    contrib-match: Cutoff where a sequence must have contribution (L1 magnitude)
    higher than the original TF-MoDISco pattern seqlets' quantile value.

    Note: Setting any of these cutoff values to 0 will mean that no value can
    be less that the lowest seqlet. Setting a cutoff to 0 is not the same as
    setting a cutoff to None.

Output Specification
--------------------

For the generated tsv file, see
:py:mod:`motifAddQuantiles<bpreveal.motifAddQuantiles>`. If you include a
``quantile-json`` in your ``seqlet-cutoff-settings``, then running
:py:mod:`motifScan<bpreveal.motifScan>` will save out the cutoff JSON.


API
---

"""
import json
import sys
import bpreveal.schema
from bpreveal import logUtils
from bpreveal import motifUtils
from bpreveal.internal import interpreter


def motifScan(config: dict) -> None:
    """Run the scan.

    :param config: A JSON object matching the motifScan specification.
    """
    logUtils.setVerbosity(config["verbosity"])
    if "seqlet-cutoff-settings" in config:
        assert "seqlet-cutoff-json" not in config, "You cannot name a " \
            "seqlet-cutoff-json to read in the config file if you also " \
            "specify seqlet-cutoff-settings."

        cutoffConfig = config["seqlet-cutoff-settings"]
        if "modisco-window" not in cutoffConfig:
            logUtils.error("Did not find a modisco window size. Reported"
                           "seqlet positions will be invalid.")
            cutoffConfig["modisco-window"] = 0
        logUtils.info("Modisco seqlet analysis requested. "
                      f"Starting with {cutoffConfig["modisco-h5"]}")
        # First, make the pattern objects.
        tsvFname = None
        if "seqlets-tsv" in cutoffConfig:
            tsvFname = cutoffConfig["seqlets-tsv"]
        scanPatternDict = motifUtils.seqletCutoffs(
            cutoffConfig["modisco-h5"],
            cutoffConfig["modisco-contrib-h5"],
            cutoffConfig["patterns"],
            cutoffConfig["seq-match-quantile"],
            cutoffConfig["contrib-match-quantile"],
            cutoffConfig["contrib-magnitude-quantile"],
            cutoffConfig["trim-threshold"],
            cutoffConfig["trim-padding"],
            cutoffConfig["background-probs"],
            cutoffConfig["modisco-window"],
            tsvFname
        )
        logUtils.info("Seqlet cutoffs determined.")
        if "quantile-json" in cutoffConfig:
            # You specified a quantile-json inside the cutoffs config.
            # Even though it isn't necessary since we just pass scanPatternDict
            # to the scanner directly, go ahead and save out the quantile json file.
            # In this case, save out the results of the seqlet analysis.
            logUtils.info(f"Saving pattern json to {cutoffConfig["quantile-json"]}")
            with open(cutoffConfig["quantile-json"], "w") as fp:
                json.dump(scanPatternDict, fp, indent=4)
    else:
        # We didn't have quantile-settings, so we'd better have quantile-json.
        # (In this case, we're reading quantile-json)
        logUtils.debug(f"Loading scanner parameters, using json {config["seqlet-cutoff-json"]}")
        with open(config["seqlet-cutoff-json"], "r") as fp:
            scanPatternDict = json.load(fp)

    scanConfig = config["scan-settings"]
    logUtils.info(f"Beginning motif scan on file {scanConfig["scan-contrib-h5"]}")
    motifUtils.scanPatterns(scanConfig["scan-contrib-h5"],
                            scanPatternDict,
                            scanConfig["hits-tsv"],
                            scanConfig["num-threads"])


def main() -> None:
    """A zero-argument wrapper around the main function."""
    configJson = interpreter.evalFile(sys.argv[1])
    assert isinstance(configJson, dict)
    bpreveal.schema.motifScan.validate(configJson)
    motifScan(configJson)


if __name__ == "__main__":
    main()
# Copyright 2022-2025 Charles McAnany. This file is part of BPReveal. BPReveal is free software: You can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 2 of the License, or (at your option) any later version. BPReveal is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with BPReveal. If not, see <https://www.gnu.org/licenses/>.  # noqa
