// Package configs provides various container-related configuration types
// used by libcontainer.
package configs

import (
	"bytes"
	"encoding/json"
	"errors"
	"fmt"
	"os/exec"
	"strconv"
	"strings"
	"time"
	"unsafe"

	"github.com/sirupsen/logrus"
	"golang.org/x/sys/unix"

	"github.com/opencontainers/cgroups"
	devices "github.com/opencontainers/cgroups/devices/config"
	"github.com/opencontainers/runtime-spec/specs-go"
)

type Rlimit struct {
	Type int    `json:"type"`
	Hard uint64 `json:"hard"`
	Soft uint64 `json:"soft"`
}

// IDMap represents UID/GID Mappings for User Namespaces.
type IDMap struct {
	ContainerID int64 `json:"container_id"`
	HostID      int64 `json:"host_id"`
	Size        int64 `json:"size"`
}

// Seccomp represents syscall restrictions
// By default, only the native architecture of the kernel is allowed to be used
// for syscalls. Additional architectures can be added by specifying them in
// Architectures.
type Seccomp struct {
	DefaultAction    Action                   `json:"default_action"`
	Architectures    []string                 `json:"architectures"`
	Flags            []specs.LinuxSeccompFlag `json:"flags"`
	Syscalls         []*Syscall               `json:"syscalls"`
	DefaultErrnoRet  *uint                    `json:"default_errno_ret"`
	ListenerPath     string                   `json:"listener_path,omitempty"`
	ListenerMetadata string                   `json:"listener_metadata,omitempty"`
}

// Action is taken upon rule match in Seccomp
type Action int

const (
	Kill Action = iota + 1
	Errno
	Trap
	Allow
	Trace
	Log
	Notify
	KillThread
	KillProcess
)

// Operator is a comparison operator to be used when matching syscall arguments in Seccomp
type Operator int

const (
	EqualTo Operator = iota + 1
	NotEqualTo
	GreaterThan
	GreaterThanOrEqualTo
	LessThan
	LessThanOrEqualTo
	MaskEqualTo
)

// Arg is a rule to match a specific syscall argument in Seccomp
type Arg struct {
	Index    uint     `json:"index"`
	Value    uint64   `json:"value"`
	ValueTwo uint64   `json:"value_two"`
	Op       Operator `json:"op"`
}

// Syscall is a rule to match a syscall in Seccomp
type Syscall struct {
	Name     string `json:"name"`
	Action   Action `json:"action"`
	ErrnoRet *uint  `json:"errnoRet"`
	Args     []*Arg `json:"args"`
}

// Config defines configuration options for executing a process inside a contained environment.
type Config struct {
	// NoPivotRoot will use MS_MOVE and a chroot to jail the process into the container's rootfs
	// This is a common option when the container is running in ramdisk.
	NoPivotRoot bool `json:"no_pivot_root,omitempty"`

	// ParentDeathSignal specifies the signal that is sent to the container's process in the case
	// that the parent process dies.
	ParentDeathSignal int `json:"parent_death_signal,omitempty"`

	// Path to a directory containing the container's root filesystem.
	Rootfs string `json:"rootfs"`

	// Umask is the umask to use inside of the container.
	Umask *uint32 `json:"umask,omitempty"`

	// Readonlyfs will remount the container's rootfs as readonly where only externally mounted
	// bind mounts are writtable.
	Readonlyfs bool `json:"readonlyfs,omitempty"`

	// Specifies the mount propagation flags to be applied to /.
	RootPropagation int `json:"rootPropagation,omitempty"`

	// Mounts specify additional source and destination paths that will be mounted inside the container's
	// rootfs and mount namespace if specified.
	Mounts []*Mount `json:"mounts"`

	// The device nodes that should be automatically created within the container upon container start.  Note, make sure that the node is marked as allowed in the cgroup as well!
	Devices []*devices.Device `json:"devices"`

	// NetDevices are key-value pairs, keyed by network device name, moved to the container's network namespace.
	NetDevices map[string]*LinuxNetDevice `json:"netDevices,omitempty"`

	MountLabel string `json:"mount_label,omitempty"`

	// Hostname optionally sets the container's hostname if provided.
	Hostname string `json:"hostname,omitempty"`

	// Domainname optionally sets the container's domainname if provided.
	Domainname string `json:"domainname,omitempty"`

	// Namespaces specifies the container's namespaces that it should setup when cloning the init process
	// If a namespace is not provided that namespace is shared from the container's parent process.
	Namespaces Namespaces `json:"namespaces"`

	// Capabilities specify the capabilities to keep when executing the process inside the container
	// All capabilities not specified will be dropped from the processes capability mask.
	Capabilities *Capabilities `json:"capabilities,omitempty"`

	// Networks specifies the container's network setup to be created.
	Networks []*Network `json:"networks,omitempty"`

	// Routes can be specified to create entries in the route table as the container is started.
	Routes []*Route `json:"routes,omitempty"`

	// Cgroups specifies specific cgroup settings for the various subsystems that the container is
	// placed into to limit the resources the container has available.
	Cgroups *cgroups.Cgroup `json:"cgroups"`

	// AppArmorProfile specifies the profile to apply to the process running in the container and is
	// change at the time the process is executed.
	AppArmorProfile string `json:"apparmor_profile,omitempty"`

	// ProcessLabel specifies the label to apply to the process running in the container.  It is
	// commonly used by selinux.
	ProcessLabel string `json:"process_label,omitempty"`

	// Rlimits specifies the resource limits, such as max open files, to set in the container
	// If Rlimits are not set, the container will inherit rlimits from the parent process.
	Rlimits []Rlimit `json:"rlimits,omitempty"`

	// OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores
	// for a process. Valid values are between the range [-1000, '1000'], where processes with
	// higher scores are preferred for being killed. If it is unset then we don't touch the current
	// value.
	// More information about kernel oom score calculation here: https://lwn.net/Articles/317814/
	OomScoreAdj *int `json:"oom_score_adj,omitempty"`

	// UIDMappings is an array of User ID mappings for User Namespaces.
	UIDMappings []IDMap `json:"uid_mappings,omitempty"`

	// GIDMappings is an array of Group ID mappings for User Namespaces.
	GIDMappings []IDMap `json:"gid_mappings,omitempty"`

	// MaskPaths specifies paths within the container's rootfs to mask over with a bind
	// mount pointing to /dev/null as to prevent reads of the file.
	MaskPaths []string `json:"mask_paths,omitempty"`

	// ReadonlyPaths specifies paths within the container's rootfs to remount as read-only
	// so that these files prevent any writes.
	ReadonlyPaths []string `json:"readonly_paths,omitempty"`

	// Sysctl is a map of properties and their values. It is the equivalent of using
	// sysctl -w my.property.name value in Linux.
	Sysctl map[string]string `json:"sysctl,omitempty"`

	// Seccomp allows actions to be taken whenever a syscall is made within the container.
	// A number of rules are given, each having an action to be taken if a syscall matches it.
	// A default action to be taken if no rules match is also given.
	Seccomp *Seccomp `json:"seccomp,omitempty"`

	// NoNewPrivileges controls whether processes in the container can gain additional privileges.
	NoNewPrivileges bool `json:"no_new_privileges,omitempty"`

	// Hooks are a collection of actions to perform at various container lifecycle events.
	// CommandHooks are serialized to JSON, but other hooks are not.
	Hooks Hooks `json:"Hooks,omitempty"`

	// Version is the version of opencontainer specification that is supported.
	Version string `json:"version"`

	// Labels are user defined metadata that is stored in the config and populated on the state
	Labels []string `json:"labels"`

	// NoNewKeyring will not allocated a new session keyring for the container.  It will use the
	// callers keyring in this case.
	NoNewKeyring bool `json:"no_new_keyring,omitempty"`

	// IntelRdt specifies settings for Intel RDT group that the container is placed into
	// to limit the resources (e.g., L3 cache, memory bandwidth) the container has available
	IntelRdt *IntelRdt `json:"intel_rdt,omitempty"`

	// MemoryPolicy specifies NUMA memory policy for the container.
	MemoryPolicy *LinuxMemoryPolicy `json:"memory_policy,omitempty"`

	// RootlessEUID is set when the runc was launched with non-zero EUID.
	// Note that RootlessEUID is set to false when launched with EUID=0 in userns.
	// When RootlessEUID is set, runc creates a new userns for the container.
	// (config.json needs to contain userns settings)
	RootlessEUID bool `json:"rootless_euid,omitempty"`

	// RootlessCgroups is set when unlikely to have the full access to cgroups.
	// When RootlessCgroups is set, cgroups errors are ignored.
	RootlessCgroups bool `json:"rootless_cgroups,omitempty"`

	// TimeOffsets specifies the offset for supporting time namespaces.
	TimeOffsets map[string]specs.LinuxTimeOffset `json:"time_offsets,omitempty"`

	// Scheduler represents the scheduling attributes for a process.
	Scheduler *Scheduler `json:"scheduler,omitempty"`

	// Personality contains configuration for the Linux personality syscall.
	Personality *LinuxPersonality `json:"personality,omitempty"`

	// IOPriority is the container's I/O priority.
	IOPriority *IOPriority `json:"io_priority,omitempty"`

	// ExecCPUAffinity is CPU affinity for a non-init process to be run in the container.
	ExecCPUAffinity *CPUAffinity `json:"exec_cpu_affinity,omitempty"`
}

// Scheduler is based on the Linux sched_setattr(2) syscall.
type Scheduler = specs.Scheduler

// ToSchedAttr is to convert *configs.Scheduler to *unix.SchedAttr.
func ToSchedAttr(scheduler *Scheduler) (*unix.SchedAttr, error) {
	var policy uint32
	switch scheduler.Policy {
	case specs.SchedOther:
		policy = 0
	case specs.SchedFIFO:
		policy = 1
	case specs.SchedRR:
		policy = 2
	case specs.SchedBatch:
		policy = 3
	case specs.SchedISO:
		policy = 4
	case specs.SchedIdle:
		policy = 5
	case specs.SchedDeadline:
		policy = 6
	default:
		return nil, fmt.Errorf("invalid scheduler policy: %s", scheduler.Policy)
	}

	var flags uint64
	for _, flag := range scheduler.Flags {
		switch flag {
		case specs.SchedFlagResetOnFork:
			flags |= 0x01
		case specs.SchedFlagReclaim:
			flags |= 0x02
		case specs.SchedFlagDLOverrun:
			flags |= 0x04
		case specs.SchedFlagKeepPolicy:
			flags |= 0x08
		case specs.SchedFlagKeepParams:
			flags |= 0x10
		case specs.SchedFlagUtilClampMin:
			flags |= 0x20
		case specs.SchedFlagUtilClampMax:
			flags |= 0x40
		default:
			return nil, fmt.Errorf("invalid scheduler flag: %s", flag)
		}
	}

	return &unix.SchedAttr{
		Size:     unix.SizeofSchedAttr,
		Policy:   policy,
		Flags:    flags,
		Nice:     scheduler.Nice,
		Priority: uint32(scheduler.Priority),
		Runtime:  scheduler.Runtime,
		Deadline: scheduler.Deadline,
		Period:   scheduler.Period,
	}, nil
}

type IOPriority = specs.LinuxIOPriority

type CPUAffinity struct {
	Initial, Final *unix.CPUSet
}

// ToCPUSet parses a string in list format into a unix.CPUSet, e.g. "0-3,5,7-9".
func ToCPUSet(str string) (*unix.CPUSet, error) {
	if str == "" {
		return nil, nil
	}
	s := new(unix.CPUSet)

	// Since (*CPUset).Set silently ignores too high CPU values,
	// find out what the maximum is, and return an error.
	maxCPU := uint64(unsafe.Sizeof(*s) * 8)
	toInt := func(v string) (int, error) {
		ret, err := strconv.ParseUint(v, 10, 32)
		if err != nil {
			return 0, err
		}
		if ret >= maxCPU {
			return 0, fmt.Errorf("values larger than %d are not supported", maxCPU-1)
		}
		return int(ret), nil
	}

	for r := range strings.SplitSeq(str, ",") {
		// Allow extra spaces around.
		r = strings.TrimSpace(r)
		// Allow empty elements (extra commas).
		if r == "" {
			continue
		}
		if r0, r1, found := strings.Cut(r, "-"); found {
			start, err := toInt(r0)
			if err != nil {
				return nil, err
			}
			end, err := toInt(r1)
			if err != nil {
				return nil, err
			}
			if start > end {
				return nil, errors.New("invalid range: " + r)
			}
			for i := start; i <= end; i++ {
				s.Set(i)
			}
		} else {
			val, err := toInt(r)
			if err != nil {
				return nil, err
			}
			s.Set(val)
		}
	}
	if s.Count() == 0 {
		return nil, fmt.Errorf("no members found in set %q", str)
	}

	return s, nil
}

// ConvertCPUAffinity converts [specs.CPUAffinity] to [CPUAffinity].
func ConvertCPUAffinity(sa *specs.CPUAffinity) (*CPUAffinity, error) {
	if sa == nil {
		return nil, nil
	}
	initial, err := ToCPUSet(sa.Initial)
	if err != nil {
		return nil, fmt.Errorf("bad CPUAffinity.Initial: %w", err)
	}
	final, err := ToCPUSet(sa.Final)
	if err != nil {
		return nil, fmt.Errorf("bad CPUAffinity.Final: %w", err)
	}
	if initial == nil && final == nil {
		return nil, nil
	}

	return &CPUAffinity{
		Initial: initial,
		Final:   final,
	}, nil
}

type (
	HookName string
	HookList []Hook
	Hooks    map[HookName]HookList
)

const (
	// Prestart commands are executed after the container namespaces are created,
	// but before the user supplied command is executed from init.
	// Note: This hook is now deprecated
	// Prestart commands are called in the Runtime namespace.
	Prestart HookName = "prestart"

	// CreateRuntime commands MUST be called as part of the create operation after
	// the runtime environment has been created but before the pivot_root has been executed.
	// CreateRuntime is called immediately after the deprecated Prestart hook.
	// CreateRuntime commands are called in the Runtime Namespace.
	CreateRuntime HookName = "createRuntime"

	// CreateContainer commands MUST be called as part of the create operation after
	// the runtime environment has been created but before the pivot_root has been executed.
	// CreateContainer commands are called in the Container namespace.
	CreateContainer HookName = "createContainer"

	// StartContainer commands MUST be called as part of the start operation and before
	// the container process is started.
	// StartContainer commands are called in the Container namespace.
	StartContainer HookName = "startContainer"

	// Poststart commands are executed after the container init process starts.
	// Poststart commands are called in the Runtime Namespace.
	Poststart HookName = "poststart"

	// Poststop commands are executed after the container init process exits.
	// Poststop commands are called in the Runtime Namespace.
	Poststop HookName = "poststop"
)

// HasHook checks if config has any hooks with any given names configured.
func (c *Config) HasHook(names ...HookName) bool {
	if c.Hooks == nil {
		return false
	}
	for _, h := range names {
		if len(c.Hooks[h]) > 0 {
			return true
		}
	}
	return false
}

// KnownHookNames returns the known hook names.
// Used by `runc features`.
func KnownHookNames() []string {
	return []string{
		string(Prestart), // deprecated
		string(CreateRuntime),
		string(CreateContainer),
		string(StartContainer),
		string(Poststart),
		string(Poststop),
	}
}

type Capabilities struct {
	// Bounding is the set of capabilities checked by the kernel.
	Bounding []string `json:"Bounding,omitempty"`
	// Effective is the set of capabilities checked by the kernel.
	Effective []string `json:"Effective,omitempty"`
	// Inheritable is the capabilities preserved across execve.
	Inheritable []string `json:"Inheritable,omitempty"`
	// Permitted is the limiting superset for effective capabilities.
	Permitted []string `json:"Permitted,omitempty"`
	// Ambient is the ambient set of capabilities that are kept.
	Ambient []string `json:"Ambient,omitempty"`
}

func (hooks *Hooks) UnmarshalJSON(b []byte) error {
	var state map[HookName][]CommandHook

	if err := json.Unmarshal(b, &state); err != nil {
		return err
	}

	*hooks = Hooks{}
	for n, commandHooks := range state {
		if len(commandHooks) == 0 {
			continue
		}

		(*hooks)[n] = HookList{}
		for _, h := range commandHooks {
			(*hooks)[n] = append((*hooks)[n], h)
		}
	}

	return nil
}

func (hooks *Hooks) MarshalJSON() ([]byte, error) {
	serialize := func(hooks []Hook) (serializableHooks []CommandHook) {
		for _, hook := range hooks {
			switch chook := hook.(type) {
			case CommandHook:
				serializableHooks = append(serializableHooks, chook)
			default:
				logrus.Warnf("cannot serialize hook of type %T, skipping", hook)
			}
		}

		return serializableHooks
	}

	return json.Marshal(map[string]any{
		"prestart":        serialize((*hooks)[Prestart]),
		"createRuntime":   serialize((*hooks)[CreateRuntime]),
		"createContainer": serialize((*hooks)[CreateContainer]),
		"startContainer":  serialize((*hooks)[StartContainer]),
		"poststart":       serialize((*hooks)[Poststart]),
		"poststop":        serialize((*hooks)[Poststop]),
	})
}

// Run executes all hooks for the given hook name.
func (hooks Hooks) Run(name HookName, state *specs.State) error {
	list := hooks[name]
	for i, h := range list {
		if err := h.Run(state); err != nil {
			return fmt.Errorf("error running %s hook #%d: %w", name, i, err)
		}
	}

	return nil
}

// SetDefaultEnv sets the environment for those CommandHook entries
// that do not have one set.
func (hooks HookList) SetDefaultEnv(env []string) {
	for _, h := range hooks {
		if ch, ok := h.(CommandHook); ok && len(ch.Env) == 0 {
			ch.Env = env
		}
	}
}

type Hook interface {
	// Run executes the hook with the provided state.
	Run(*specs.State) error
}

// NewFunctionHook will call the provided function when the hook is run.
func NewFunctionHook(f func(*specs.State) error) FuncHook {
	return FuncHook{
		run: f,
	}
}

type FuncHook struct {
	run func(*specs.State) error
}

func (f FuncHook) Run(s *specs.State) error {
	return f.run(s)
}

type Command struct {
	Path    string         `json:"path"`
	Args    []string       `json:"args"`
	Env     []string       `json:"env"`
	Dir     string         `json:"dir"`
	Timeout *time.Duration `json:"timeout"`
}

// NewCommandHook will execute the provided command when the hook is run.
func NewCommandHook(cmd *Command) CommandHook {
	return CommandHook{
		Command: cmd,
	}
}

type CommandHook struct {
	*Command
}

func (c *Command) Run(s *specs.State) error {
	b, err := json.Marshal(s)
	if err != nil {
		return err
	}
	var stdout, stderr bytes.Buffer
	cmd := exec.Cmd{
		Path:   c.Path,
		Args:   c.Args,
		Env:    c.Env,
		Stdin:  bytes.NewReader(b),
		Stdout: &stdout,
		Stderr: &stderr,
	}
	if err := cmd.Start(); err != nil {
		return err
	}
	errC := make(chan error, 1)
	go func() {
		err := cmd.Wait()
		if err != nil {
			err = fmt.Errorf("%w, stdout: %s, stderr: %s", err, stdout.String(), stderr.String())
		}
		errC <- err
	}()
	var timerCh <-chan time.Time
	if c.Timeout != nil {
		timer := time.NewTimer(*c.Timeout)
		defer timer.Stop()
		timerCh = timer.C
	}
	select {
	case err := <-errC:
		return err
	case <-timerCh:
		_ = cmd.Process.Kill()
		<-errC
		return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds())
	}
}
