###############################################################################
#                                                                             #
# Peekaboo Extended Email Attachment Behavior Observation Owl                 #
#                                                                             #
# 10-ask_peekaboo
###############################################################################
#                                                                             #
# Copyright (C) 2016-2022 science + computing ag                              #
#                                                                             #
# This program is free software: you can redistribute it and/or modify        #
# it under the terms of the GNU General Public License as published by        #
# the Free Software Foundation, either version 3 of the License, or (at       #
# your option) any later version.                                             #
#                                                                             #
# This program is distributed in the hope that it will be useful, but         #
# WITHOUT ANY WARRANTY; without even the implied warranty of                  #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU           #
# General Public License for more details.                                    #
#                                                                             #
# You should have received a copy of the GNU General Public License           #
# along with this program.  If not, see <http://www.gnu.org/licenses/>.       #
#                                                                             #
###############################################################################

use strict;

# use pure-perl implementation since performance should be enough for our needs
# and it's been a core module since 5.14, avoiding an additional dependency on
# JSON or JSON::XS
use JSON::PP;
# Yes, Amavis has its own minimalist JSON implementation Amavis::JSON. But it
# doesn't seem to do UTF-8 encoding correctly - everything ends up as latin1.

use LWP::UserAgent;
use URI::Escape;

# monkey patch a content_disposition setter/getter into the Parts package
*{Amavis::Unpackers::Part::content_disposition} = sub {
  @_<2 ? shift->{co_disp} : ($_[0]->{co_disp} = $_[1]);
};

# monkey patch a wrapper for mime_traverse into the MIME package
my $amavis_mime_traverse = \&Amavis::Unpackers::MIME::mime_traverse;
*{Amavis::Unpackers::MIME::mime_traverse} = sub ($$$$$) {
  my($entity, $tempdir, $parent_obj, $depth, $placement) = @_;

  &$amavis_mime_traverse($entity, $tempdir, $parent_obj, $depth, $placement);

  # go through list of children detected by call above and assign additional
  # properties
  foreach my $child (@{$parent_obj->children}) {
    $child->content_disposition($entity->head->mime_attr('content-disposition'));
  }
};

# stolen directly from mime_traverse() for reproducing its results
sub peekaboo_decode_filename($) {
  my($val_raw) = @_;

  my $val_dec = '';  # decoded, represented as native Perl characters
  eval {
    my(@chunks) = MIME::Words::decode_mimewords($val_raw);
    for my $pair (@chunks) {
      my($data,$encoding) = @$pair;
      if (!defined $encoding || $encoding eq '') {
        $val_dec .= safe_decode_latin1($data);  # assumes ISO-8859-1
      } else {
        $encoding =~ s/\*[^*]*\z//s;  # strip RFC 2231 language suffix
        $val_dec .= safe_decode($encoding,$data);
      }
    }
    1;
  };

  return $val_dec;
}

sub ask_peekaboo {
  my($dummy, $names_to_parts, $dummy, $av_name, $dummy, $args, $dummy,
     $dummy, $dummy) = @_;

  # we receive the url and potentially other parameters from @av_scanners (15-av_scanners)
  my $base_url = "http://127.0.0.1:8100";
  my $polling_interval = 5;

  if (defined($args)) {
    $base_url = $$args[0] if (defined($$args[0]));
    $polling_interval = $$args[1] if (defined($$args[1]));
  }

  my $scan_url = "${base_url}/v1/scan";
  my $report_url_tmpl = "${base_url}/v1/report/%d";

  # extract some additional info for peekaboo from the parts objects and
  # prepare and run http reuqests to the peekaboo server
  my $ua = LWP::UserAgent->new();
  my @jobs;
  my $result_text = "";
  foreach my $partname (keys %{$names_to_parts}) {
    my $part = $names_to_parts->{$partname};
    my $name_declared = $part->name_declared();

    # let's put on our thinking caps here: In the case of MIME parts (as opposed
    # to extracted archives which came from MIME parts), name_declared is
    # extracted from MIME part headers Content-Disposition and Content-Type by
    # sub mime_traverse() in that order.
    #
    # Decoding from MIME-encoding as per RFC1522/1867/2184/2231 is attempted.
    # If it is successful, the decoded value is added to an array. The original
    # raw value is added after that. Both are only added if they haven't been
    # seen before. Finally, name_declared is turned into a scalar if it has only
    # one element.
    #
    # A scalar name_declared will therefore always be a raw undecoded value
    # from Content-Disposition or Content-Type (only one of them present or
    # both with same value) that couldn't be decoded.
    #
    # An array is much more fluid because we can have a number of permutations
    # in there without a means to distinguish them:
    #
    # [ fn_cd_dec, fn_cd_raw, fn_ct_dec, fn_ct_raw ]
    # [ fn_cd_dec, fn_cd_raw ] # fn_ct_raw not present or == fn_cd_raw
    # [ fn_ct_dec, fn_ct_raw ]
    # [ fn_cd_dec, fn_cd_raw, fn_ct_raw ] # decoding failed or result == fn_cd_dec
    # [ fn_cd_raw, fn_ct_dec, fn_ct_raw ]
    # [ fn_cd_raw, fn_ct_raw ]
    #
    # What's the additional pity here is that a raw UTF-8 filename can (at least
    # sometimes) be decoded by AMaViS using latin1 into something worse but we
    # have no way to tell. :(
    my $decoded_filename = $name_declared;
    if (ref $name_declared eq 'ARRAY') {
      # start out with a best-effort default for all cases and try to improve from
      # there. First element either be decoded or raw:
      # - for four elements it'll be decoded,
      # - for two elements it'll either be decoded or raw. In the latter
      #   case there's nothing to improve.
      # - if AMaViS were to inspect more headers so the array and possible
      #   permutations grows, we're still best off using the first element and
      #   not mess with it
      $decoded_filename = $$name_declared[0];

      my $numnames = @$name_declared;
      if ($numnames == 3) {
        # default above would mean first element is decoded second element

        # last element is guaranteed to be raw
        if (peekaboo_decode_filename($$name_declared[2]) eq $$name_declared[1]) {
          # second element is actually decoded third element, i.e. filename from
          # Content-Disposition, so first element is raw, use second element
          # because it's more good[tm]
          $decoded_filename = $$name_declared[1];
        }
      }
    }

    # encode our hard-earned decoded filename for transfer as
    # multipart/form-data. We choose the method according to RFC2388 section
    # 4.4 (filename*=utf-8''<percent-encoded> derived from RFC2231) because
    # it's supported by Peekaboo via sanic. Passing it raw but UTF-8-encoded
    # (i.e. in form charset) is accepted practice as well and legitimated by
    # RFC7578 sections 4.2 and 5.1.3 but libwww-perl has been observed to
    # resort to latin1 and cannot be told otherwise (likely perl interpreter
    # default for file descriptor output - which we do not want to change
    # because it would affect all of AMaViS). Also, the HTML5 standard has gone
    # through various iterations of how to encode special characters such as
    # double quote, newline, linefeed and backslash over the years so that
    # there's currently total confusion within implementations what to do.
    # sanic for example decodes double quotes but not newlines or linefeeds as
    # the current version of the standard requires.
    #
    # To make matters somewhat more interesting, RFC7578 indirectly forbids
    # RFC2231 encoding via forbidding RFC5987. So currently our client will be
    # RFC2388- but not RFC7578-compliant. Looking at the encoding uncertainty
    # caused by HTML5 standard evolution this seems the lesser of two evils.
    # This could be revisited as we find a way to send out UTF-8-encoded
    # multipart/form-data from AMaViS and support for HTML5-style
    # percent-encoding stabilizes in client and server libraries (libwww-perl
    # and sanic in our case).
    #
    # Because RFC2388 is directly based on MIME, it requires encoding using
    # MIME mechanisms and we could re-use existing email handling modules.
    # Email::MIME::ContentType does not allow to encode just the filename
    # (flimsy argument, see below) and Encode::MIME:Header does not do RFC2231.
    # So we do it like python's email.utils.encode_rfc2231() does: Encode
    # using URI::Escape and add the structure around it ourselves. URI::Escape
    # is a dependency of LWP::UserAgent which we're using anyway.
    #
    # Because libwww-perl does not allow specifying a pre-encoded filename, we
    # circle back to building the whole Content-Disposition header ourselves if
    # a filename is to be supplied. Luckily, as per RFC5987 section 3.1 we do
    # not need to deal with continuations here.
    my $cd = undef;
    if (defined($name_declared)) {
      my $escaped_filename = uri_escape_utf8($decoded_filename);
      my $encoded_filename = "utf-8''${escaped_filename}";
      $cd = qq(form-data; name="file"; filename*="$encoded_filename")
    }

    my $answer = $ua->post($scan_url,
      Content_Type => 'form-data',
      # could be per-file but complicates handling server-side
      'X-Content-Disposition' => $part->content_disposition(),
      Content => [
        'file' => [
          $part->full_name(),
          # suppress deriving from input file name, we add a header with
          # encoded filename and asterisk for marking it as encoded ourselves
          '',
          Content_Type => $part->type_declared(),
          Content_Disposition => $cd,
        ]
      ]
    );

    my $code = $answer->code;
    if ($code != 200) {
      my $message = sprintf("Peekaboo: Received error %d when submitting %s",
              $code, $partname);
      Amavis::Util::do_log(1, $message);
      return (undef, $result_text . $message . "\n", undef);
    }

    # decode the text part of the response
    my $peekaboo_response = JSON::PP->new->utf8->decode(
            $answer->decoded_content);
    my $job = $peekaboo_response->{"job_id"};

    my $message = sprintf("Peekaboo: %s submitted as job %d", $partname, $job);
    Amavis::Util::do_log(5, "Peekaboo: " . $message);
    $result_text .= $message . "\n";

    push(@jobs, $job);
  }

  # poll the running jobs until all are done
  my @jobs_left = ();
  while (@jobs) {
    my $duration = $polling_interval;
    my $numjobs = $#jobs + 1;
    if (!@jobs_left) {
      # sleep a hundredth of the polling interval per job on the first loop in
      # case all jobs got answered from cached results and are ready very
      # quickly
      $duration = $polling_interval / 100 * $#jobs;
    }

    Amavis::Util::do_log(5, "Peekaboo: Sleeping %.2f seconds before " .
            "polling %d jobs again", $duration, $numjobs);
    sleep($duration);

    @jobs_left = ();
    for my $job (@jobs) {
      my $answer = $ua->get(sprintf($report_url_tmpl, $job));

      my $code = $answer->code;
      if ($code == 200) {
        my $message = sprintf("Peekaboo: Received report for job %d", $job);
        Amavis::Util::do_log(5, $message);
        $result_text .= $message . "\n";

        # final answer
        my $peekaboo_answer = JSON::PP->new->utf8->decode($answer->decoded_content);

        # extend result text with report and reason from this result
        my $report = $peekaboo_answer->{"report"};
        if (defined($report)) {
          $result_text .= join("\n", @{$report});
        }

        $result_text .= $peekaboo_answer->{"reason"} . "\n";

        # check for bad or failure first
        my $result = $peekaboo_answer->{"result"};
        if ($result eq "failed") {
          # no sense in dragging this out since one failed analysis means we
          # cannot reach any conclusive verdict
          my $message = sprintf("Peekaboo: Job %d failed", $job);
          Amavis::Util::do_log(1, $message);
          return (undef, $result_text . $message . "\n", undef)
        } elsif ($result eq "bad") {
          # same applies for one bad sample: we found malware, done
          my $message = sprintf("Peekaboo: Job %d found malware", $job);
          Amavis::Util::do_log(5, $message);
          return (1, $result_text . $message . "\n", undef)
        }

        # everything else means that this job did not find malware nor fail
      } elsif ($code == 404) {
        # result not yet present
        Amavis::Util::do_log(5, "Peekaboo: Job %d not finished yet", $job);
        push(@jobs_left, $job);
        next;
      } else {
        # all other codes indicate some kind of error
        my $message = sprintf("Peekaboo: Received error %d when " .
                "requesting report for job %d", $code, $job);
        Amavis::Util::do_log(1, $message);
        return (undef, $result_text . $message . "\n", undef);
      }
    }

    @jobs = @jobs_left;
  }

  # see amavisd-new routines run_av() and virus_scan() for the meaning of these
  # return values.
  my $message = sprintf("Peekaboo: No malware found");
  Amavis::Util::do_log(1, $message);
  return (0, $result_text . $message . "\n", undef);
}

1;  # ensure a defined return value
