diff --git a/Makefile b/Makefile
index 03ab3428..93cc05de 100644
--- a/Makefile
+++ b/Makefile
@@ -1,14 +1,16 @@
 CEK_DIRECTORIES_WITH_SHELL_FILES ?= roles/ examples/ playbooks/infra/ playbooks/intel/
-ARCH ?= 'icx'
+ARCH ?= 'spr'
 NIC ?= 'cvl'
 MIRRORS ?= false
 PLAYBOOKS_DIRS = playbooks playbooks/infra playbooks/intel
-PLAYBOOK_NAMES = access basic full_nfv on_prem on_prem_vss on_prem_sw_defined_factory on_prem_aibox regional_dc remote_fp build_your_own
+PLAYBOOK_NAMES = access basic base_video_analytics full_nfv on_prem on_prem_vss on_prem_sw_defined_factory on_prem_aibox regional_dc remote_fp build_your_own
+
+USERNAME = 'root'

 # set default target available with simple 'make' command
 .DEFAULT_GOAL := examples

-.PHONY: shellcheck ansible-lint all-profiles clean clean-playbooks help k8s-profiles vm-profiles cloud-profiles
+.PHONY: shellcheck ansible-lint all-profiles clean clean-playbooks help k8s-profile vm-profile cloud-profile auto-k8s-profile auto-vm-profile auto-cloud-profile

 shellcheck:
 	find $(CEK_DIRECTORIES_WITH_SHELL_FILES) -type f \( -name '*.sh' -o -name '*.bash' -o -name '*.ksh' -o -name '*.bashrc' -o -name '*.bash_profile' -o -name '*.bash_login' -o -name '*.bash_logout' \) \
@@ -17,20 +19,34 @@ shellcheck:
 ansible-lint:
 	ansible-lint playbooks/* roles/* -c .ansible-lint

-# make sure PROFILE is set to an 'all_examples' string for 'examples' and empty target
-ifeq ($(MAKECMDGOALS), $(filter $(MAKECMDGOALS),examples ''))
+# make sure PROFILE is set to an 'all_examples' string for 'examples', 'auto-examples' and empty target
+ifeq ($(MAKECMDGOALS), $(filter $(MAKECMDGOALS),examples auto-examples ''))
 override PROFILE = 'all_examples'
 endif

 # make sure PROFILE is defined for mode-related targets
 ifndef PROFILE
-ifeq ($(MAKECMDGOALS), $(filter $(MAKECMDGOALS),k8s-profile vm-profile cloud-profile))
+ifeq ($(MAKECMDGOALS), $(filter $(MAKECMDGOALS),k8s-profile vm-profile cloud-profile auto-k8s-profile auto-vm-profile auto-cloud-profile))
 $(error please specify which profile should be generated, e.g. PROFILE=basic. Run 'make help' for more information.)
 endif
 endif

+ifdef MAKECMDGOALS
+ifeq ($(MAKECMDGOALS), $(filter $(MAKECMDGOALS),auto-k8s-profile auto-vm-profile auto-cloud-profile auto-examples))
+ifndef HOSTS
+$(error please set machine IPs for auto-detection, e.g. HOSTS=a.a.a.a,b.b.b.b. Run 'make help' for more information.)
+endif
+RESULT = $(shell python3 ./scripts/autodetect_arch_and_nic_type.py -m $(HOSTS) -u $(USERNAME) || { echo >&2 "Unable to auto-detect ARCH and NIC. Exiting."; kill $$PPID; })
+ARCH = $(word 1,$(subst ;, ,$(RESULT)))
+NIC = $(word 2,$(subst ;, ,$(RESULT)))
+$(info Autodetected ARCH=$(ARCH) NIC=$(NIC))
+endif
+endif
+
 examples: k8s-profile vm-profile cloud-profile

+auto-examples: auto-k8s-profile auto-vm-profile auto-cloud-profile
+
 k8s-profile: clean-playbooks
 	python3 generate/render.py \
 	--config generate/profiles_templates/k8s/profiles.yml \
@@ -44,6 +60,8 @@ k8s-profile: clean-playbooks
 	-n ${NIC} \
 	-m ${MIRRORS}

+auto-k8s-profile: k8s-profile
+
 vm-profile: clean-playbooks
 	python3 generate/render.py \
 	--config generate/profiles_templates/vm/vm_host_profiles.yml \
@@ -58,6 +76,8 @@ vm-profile: clean-playbooks
 	-n ${NIC} \
 	-m ${MIRRORS}

+auto-vm-profile: vm-profile
+
 cloud-profile: clean-playbooks
 	python3 generate/render.py \
 	--config generate/profiles_templates/cloud/profiles.yml \
@@ -71,38 +91,44 @@ cloud-profile: clean-playbooks
 	-n ${NIC} \
 	-m ${MIRRORS}

+auto-cloud-profile: cloud-profile
+
 clean: clean-playbooks clean-project-root-dir

 clean-backups:
 	rm -rf backups

 clean-project-root-dir:
-	rm -rf examples host_vars group_vars inventory.ini
+	rm -rf examples host_vars group_vars inventory.ini .nic-pci-*.yml .qat-pci-*.yml

 clean-playbooks:
 	for d in $(PLAYBOOKS_DIRS) ; do for n in $(PLAYBOOK_NAMES) ; do rm -f $$d/$$n.yml ; done done

 help:
 	@echo "Cleaning targets:"
-	@echo "  clean                   - removes examples directory,"
-	@echo "                            all host_vars and group_vars dirs,"
-	@echo "                            inventory files and playbooks"
+	@echo "  clean                    - removes examples directory,"
+	@echo "                             all host_vars and group_vars dirs,"
+	@echo "                             inventory files and playbooks"
 	@echo ""
-	@echo "  clean-backups           - clean generated backup files."
+	@echo "  clean-backups            - clean generated backup files."
 	@echo ""
 	@echo "Generate example profiles:"
-	@echo "  make, examples          - generate sample files of all available profiles."
+	@echo "  make, examples           - generate sample files of all available profiles."
 	@echo ""
 	@echo "Generating k8s profile:"
-	@echo "  k8s-profile PROFILE=<profile_name> - generate files required for deployment of specific profile in k8s mode."
+	@echo "  k8s-profile PROFILE=<profile_name>      - generate files required for deployment of specific profile in k8s mode."
+	@echo "  auto-k8s-profile PROFILE=<profile_name>"
 	@echo ""
 	@echo "Generating VM profile:"
-	@echo "  vm-profile PROFILE=<profile_name> - generate files required for deployment of specific profile in vm mode."
+	@echo "  vm-profile PROFILE=<profile_name>       - generate files required for deployment of specific profile in vm mode."
+	@echo "  auto-vm-profile PROFILE=<profile_name>"
 	@echo ""
 	@echo "Generating Cloud profile:"
-	@echo "  cloud-profile PROFILE=<profile_name> - generate files required for deployment of specific profile in cloud mode."
+	@echo "  cloud-profile PROFILE=<profile_name>    - generate files required for deployment of specific profile in cloud mode."
+ @echo " auto-cloud-profile PROFILE=" @echo "" @echo "For more information about:" + @echo " - architecture and ethernet network adapter auto-detection" @echo " - profiles generation" @echo " - supported architectures" @echo " - available profiles" diff --git a/Pipfile b/Pipfile index e9e73ce7..4f3c8001 100644 --- a/Pipfile +++ b/Pipfile @@ -4,22 +4,22 @@ verify_ssl = true name = "pypi" [packages] -ansible = "~=7.7.0" -"ansible-core" = "~=2.14" +ansible = "~=8.6.1" +"ansible-core" = "~=2.15" cryptography = "~=41.0" jinja2 = "~=3.1" -netaddr = "~=0.8.0" -pbr = "~=5.11" +netaddr = "~=0.9.0" +pbr = "~=6.0" jmespath = "~=1.0.1" -"ruamel.yaml" = "~=0.17.32" -"ruamel.yaml.clib" = "~=0.2.7" +"ruamel.yaml" = "~=0.17.40" +"ruamel.yaml.clib" = "~=0.2.8" MarkupSafe = "~=2.1" -ipaddr = "*" [dev-packages] -ansible-lint = "~=6.12.2" -pylint = "~=2.17.5" +ansible-lint = "~=6.21.1" +pylint = "~=3.0.2" bandit = "~=1.7.5" +licenseheaders = "~=0.8.8" [requires] python_version = "3" diff --git a/README.md b/README.md index fc4e8911..37021bb6 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,12 @@ The software provided here is for reference only and not intended for production export PROFILE=access ``` + - For **Kubernetes Edge Ready Infrastructure** deployment: + + ```bash + export PROFILE=base_video_analytics + ``` + - For **Kubernetes Regional Data Center Infrastructure** deployment: ```bash @@ -99,15 +105,26 @@ The software provided here is for reference only and not intended for production ansible-galaxy install -r collections/requirements.yml ``` -4. Generate example host_vars, group_vars and inventory files for Intel Container Experience Kits profiles. +4. Copy SSH key to all Kubernetes nodes or VM hosts you are going to use. + + ```bash + ssh-copy-id @ + ``` + +5. Generate example host_vars, group_vars and inventory files for Intel Container Experience Kits profiles. > **_NOTE:_** It is **highly recommended** to read [this](docs/generate_profiles.md) file before profiles generation. + Architecture and Ethernet Network Adapter type can be auto-discovered: + ```bash + make auto-examples HOSTS=X.X.X.X,X.X.X.X USERNAME= + ``` + or specified manually: ```bash - make examples ARCH= NIC= + make examples ARCH= NIC= ``` -5. Copy example inventory file to the project root dir. +6. Copy example inventory file to the project root dir. ```bash cp examples/k8s/${PROFILE}/inventory.ini . @@ -121,7 +138,7 @@ The software provided here is for reference only and not intended for production > **_NOTE:_** For cloud profiles no inventory.ini file is created, as it will be generated during machine provisioning. As a result, step 6 can be skipped. -6. Update inventory file with your environment details. +7. Update inventory file with your environment details. For VM case: update details relevant for vm_host @@ -133,7 +150,7 @@ The software provided here is for reference only and not intended for production In `all_system_facts.txt` file you will find details about your hardware, operating system and network interfaces, which will help to properly configure Ansible variables in the next steps. -7. Copy group_vars and host_vars directories to the project root dir. +8. Copy group_vars and host_vars directories to the project root dir. ```bash cp -r examples/k8s/${PROFILE}/group_vars examples/k8s/${PROFILE}/host_vars . @@ -151,7 +168,7 @@ The software provided here is for reference only and not intended for production cp -r examples/cloud/${PROFILE}/group_vars examples/cloud/${PROFILE}/host_vars . ``` -8. 
Update group and host vars to match your desired configuration. Refer to [this section](#configuration) for more details. +9. Update group and host vars to match your desired configuration. Refer to [this section](#configuration) for more details. > **_NOTE:_** Please pay special attention to the `http_proxy`, `https_proxy` and `additional_no_proxy` vars if you're behind proxy. @@ -165,13 +182,13 @@ The software provided here is for reference only and not intended for production Needed details are at least dataplane_interfaces For more details see [VM case configuration guide](docs/vm_config_guide.md) -9. **Mandatory:** Apply patch for Kubespray collection. +10. **Mandatory:** Apply patch for Kubespray collection. ```bash ansible-playbook -i inventory.ini playbooks/k8s/patch_kubespray.yml ``` -10. Execute `ansible-playbook`. +11. Execute `ansible-playbook`. > **_NOTE:_** For Cloud case this step is not used. See the [cloud/](cloud/) directory for more details @@ -194,6 +211,10 @@ The software provided here is for reference only and not intended for production > **_NOTE:_** VMs are accessible from ansible host via ssh vm-ctrl-1 or ssh vm-work-1 +## Cleanup + +Refer to the [documentation](docs/redeploy_cleanup.md) to see details about how to cleanup existing deployment or specific feature. + ## Configuration Refer to the documentation linked below to see configuration details for selected capabilities and deployment profiles. diff --git a/action_plugins/cpupin.py b/action_plugins/cpupin.py index fb42b07b..08d8f8c1 100644 --- a/action_plugins/cpupin.py +++ b/action_plugins/cpupin.py @@ -23,10 +23,13 @@ import re import random import copy +import json +import os.path from ansible.module_utils._text import to_native from ansible.plugins.action import ActionBase from ansible.errors import AnsibleActionFail +from ansible.utils.display import Display # Minimum required vCPUs for the VM MINIMUM_VCPUS = 2 @@ -35,6 +38,10 @@ # Minimum required vCPUs for host OS MINIMUM_HOST_OS_VCPUS = 2 +# Directory to store cpupin states +STORE_DIR = os.path.expanduser('~') + "/.cpupin/" + +display = Display() class ActionModule(ActionBase): """cpupin action plugin implementation""" @@ -45,6 +52,10 @@ def __init__(self, task, connection, play_context, loader, templar, shared_loade # Example: [[0, 1, 2, 3, 4, 5, 6, 7], [44, 45, 46, 47, 48, 49, 50, 51]] # list self.host_os_cpus = [] + # Dictionary with CPUs allocated for host OS and corresponding NUMA node + # Example: {'node0': [[0, 1, 2, 3, 4, 5, 6, 7], [44, 45, 46, 47, 48, 49, 50, 51]]} + # dict + self.host_os_cpus_dict = {} # Dictionary with all NUMA nodes available on host and their respective CPUs # (currently available) # Example: {'node0': [[0, 1, ... , 21],[44, 45, ... , 65]], 'node1': [[22, 23, ... , 43], @@ -63,6 +74,9 @@ def __init__(self, task, connection, play_context, loader, templar, shared_loade # Number of unallocated CPUs # int self.unallocated_cpus = 0 + # NUMA node for unallocated CPUs + # int + self.unallocated_numa = None # Set if some change happened during module run # bool self.changed = False @@ -90,6 +104,9 @@ def __init__(self, task, connection, play_context, loader, templar, shared_loade # Do we really do CPU pinning, or just allocate CPUs? 
 # bool
         self.pinning = False
+        # String with VM host name
+        # str
+        self.host_name = None
         # Return values
         # dict
         self.result = {'changed': self.changed,
@@ -106,6 +123,10 @@ def __init__(self, task, connection, play_context, loader, templar, shared_loade
         # Number of cpus in list of allocate CPUs (self.cpu_list)
         # int
         self.cpu_list_count = None
+        # Dictionary with selected CPUs and corresponding NUMA node
+        # Example: {'node0': [[8, 9, ...],[44, 45, ...]]}
+        # dict
+        self.cpu_list_dict = {}
         # CPUs selected for emulator pinning
         # Example: [8, 44]
         # list
@@ -119,6 +140,8 @@ def run(self, tmp=None, task_vars=None):
                                         module_args=module_args,
                                         task_vars=task_vars, tmp=tmp)

+        display.vv("start cpupin plugin")
+
         self.name = self._task.args.get('name', None)
         self.number = self._task.args.get('number', None)
         self.cpus = self._task.args.get('cpus', None)
@@ -126,30 +149,71 @@
         self.number_host_os = self._task.args.get('number_host_os', HOST_OS_VCPUS)
         self.alloc_all = self._task.args.get('alloc_all', False)
         self.pinning = self._task.args.get('pinning', None)
-
-        # Initialise numa_nodes key
-        if 'numa_nodes' not in task_vars:
-            task_vars['numa_nodes'] = {}
-        # Initialise numa_nodes_cpus key
-        if 'numa_nodes_cpus' not in task_vars:
-            task_vars['numa_nodes_cpus'] = {}
-        # Initialise numa_nodes_cpus_orig key
-        if 'numa_nodes_cpus_orig' not in task_vars:
-            task_vars['numa_nodes_cpus_orig'] = {}
-        # Initialise host_os_cpus key
-        if 'host_os_cpus' not in task_vars:
-            task_vars['host_os_cpus'] = {}
+        self.host_name = self._task.args.get('host_name', None)
+        display.vv("running cpupin plugin with args: 'name=%s, number=%s, cpus=%s, numa=%s, number_host_os=%s, alloc_all=%s, pinning=%s, host_name=%s'" %
+                   (self.name, self.number, self.cpus, self.numa, self.number_host_os, self.alloc_all, self.pinning, self.host_name))

         msg = ""
+        if self.name is None:
+            msg = "'name' parameter is required"
+
+        if self.host_name is None:
+            msg = "'host_name' parameter is required"
+
+        if msg:
+            raise AnsibleActionFail(msg)
+
+        numa_nodes_path = STORE_DIR + self.host_name + "_numa_nodes"
+        numa_nodes_cpus_path = STORE_DIR + self.host_name + "_numa_nodes_cpus"
+        numa_nodes_cpus_orig_path = STORE_DIR + self.host_name + "_numa_nodes_cpus_orig"
+        host_os_cpus_path = STORE_DIR + self.host_name + "_host_os_cpus"
+
+        vm_cpus_path = STORE_DIR + self.host_name + "_" + self.name
+
+        if not os.path.exists(STORE_DIR):
+            os.makedirs(STORE_DIR)
+
+        if os.path.isfile(numa_nodes_path):
+            with open(numa_nodes_path, "r") as fp_nn:
+                # Load the dictionary from the file
+                self.numa_nodes = json.load(fp_nn)
+                display.vv("initialize numa_nodes from file")
+                display.vvv("read numa_nodes from file: %s -> %s" % (numa_nodes_path, self.numa_nodes))
+
+        if os.path.isfile(numa_nodes_cpus_path):
+            with open(numa_nodes_cpus_path, "r") as fp_nnc:
+                # Load the dictionary from the file
+                self.numa_nodes_cpus = json.load(fp_nnc)
+                display.vv("initialize numa_nodes_cpus from file")
+                display.vvv("read numa_nodes_cpus from file: %s -> %s" % (numa_nodes_cpus_path, self.numa_nodes_cpus))
+
+        if os.path.isfile(numa_nodes_cpus_orig_path):
+            with open(numa_nodes_cpus_orig_path, "r") as fp_nnco:
+                # Load the dictionary from the file
+                self.numa_nodes_cpus_orig = json.load(fp_nnco)
+                display.vv("initialize numa_nodes_cpus_orig from file")
+                display.vvv("read numa_nodes_cpus_orig from file: %s -> %s" % (numa_nodes_cpus_orig_path, self.numa_nodes_cpus_orig))
+
+        if os.path.isfile(host_os_cpus_path):
+            with open(host_os_cpus_path, "r") as fp:
+                # Load
the dictionary from the file + self.host_os_cpus_dict = json.load(fp) + display.vv("initialize host_os_cpus from file") + display.vvv("read host_os_cpus from file: %s -> %s" % (host_os_cpus_path, self.host_os_cpus_dict)) + + if os.path.isfile(vm_cpus_path): + with open(vm_cpus_path, "r") as fp_vm: + # Load the dictionary from the file + self.cpu_list_dict = json.load(fp_vm) + display.vv("initialize allocated cpus for VM: %s from file" % self.name) + display.vvv("read allocated cpus for VM: %s from file: %s -> %s" % (self.name, vm_cpus_path, self.cpu_list_dict)) + if self.cpus: if self.numa is None: msg = "'numa' parameter has to be used together with 'cpus' parameter" self.result['cpus'] = self.cpus - if self.name is None: - msg = "'name' parameter is required" - if self.pinning is None: msg = "'pinning' argument is mandatory" @@ -179,41 +243,44 @@ def run(self, tmp=None, task_vars=None): ", 'cpus' parameter have to be prepared in advance e.g.: via running module with" " pinning=false") - if self.pinning and self.alloc_all and (not self.cpus or self.numa): - msg = ("When using parameters pinning=true and alloc_all=true, 'numa' parameter is None" - ", 'cpus' parameter have to be prepared in advance e.g.: via running module with " - "pinning=false") - if msg: raise AnsibleActionFail(msg) - if task_vars['numa_nodes_cpus']: - self.numa_nodes_cpus = task_vars['numa_nodes_cpus'] - else: - # Gather hardware information - if not self.numa_nodes_cpus: - self._numa_nodes_cpus() - self.numa_nodes_cpus_orig = copy.deepcopy(self.numa_nodes_cpus) - task_vars['numa_nodes_cpus'] = self.numa_nodes_cpus - task_vars['numa_nodes_cpus_orig'] = self.numa_nodes_cpus_orig - task_vars['numa_nodes'] = self.numa_nodes - - if task_vars['numa_nodes_cpus_orig']: - self.numa_nodes_cpus_orig = task_vars['numa_nodes_cpus_orig'] - - if task_vars['numa_nodes']: - self.numa_nodes = task_vars['numa_nodes'] - - if task_vars['host_os_cpus']: - self.host_os_cpus = task_vars['host_os_cpus'] + # Gather hardware information if not available yet + if not self.numa_nodes_cpus: + self._numa_nodes_cpus() + self.numa_nodes_cpus_orig = copy.deepcopy(self.numa_nodes_cpus) + + if len(self.host_os_cpus_dict) > 0: + numa = f"node{0}" + self.host_os_cpus += self.host_os_cpus_dict[numa][0] + self.host_os_cpus += self.host_os_cpus_dict[numa][1] + self.host_os_cpus = sorted(self.host_os_cpus) + display.vvv("host_os_cpus from dict: %s" % (self.host_os_cpus)) + if len(self.host_os_cpus) != int(self.number_host_os): + display.vvv("number of host_os_cpus from stored file: %s differs from requested number: %s" % (len(self.host_os_cpus), self.number_host_os)) + # Release old host_os_cpus allocation + self.numa_nodes_cpus = self._merge_dicts(self.numa_nodes_cpus, self.host_os_cpus_dict) + self.host_os_cpus_dict = {} + self.host_os_cpus = [] + if os.path.isfile(host_os_cpus_path): + os.remove(host_os_cpus_path) + # End of Release old host_os_cpus allocation if not self.pinning: # Run sanity checks - self._sanity_checks(task_vars) - if self.alloc_all: - task_vars = self._allocate_all_cpus(task_vars) - else: - task_vars = self._allocate_cpus(task_vars) + self._sanity_checks() + if not self._use_stored_allocation(): + # Release old unused allocation + self.numa_nodes_cpus = self._merge_dicts(self.numa_nodes_cpus, self.cpu_list_dict) + self.cpu_list_dict = {} + if os.path.isfile(vm_cpus_path): + os.remove(vm_cpus_path) + # End of Release old unused allocation + if self.alloc_all: + self._allocate_all_cpus() + else: + self._allocate_cpus() 
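With the `task_vars` bookkeeping gone, allocations persist between playbook runs as plain JSON files under `~/.cpupin/`, named `<host_name>_<kind>` for host-wide state and `<host_name>_<VM name>` for per-VM allocations. A small illustration of the round-trip (the host name `host1` and VM name `vm-work-1` are made up for this sketch):

```python
# Illustration only (not plugin code): the ~/.cpupin/ state files are plain
# JSON in the {'nodeN': [[cores], [thread siblings]]} shape documented in the
# class comments above.
import json
import os

store_dir = os.path.expanduser("~") + "/.cpupin/"
os.makedirs(store_dir, exist_ok=True)

allocation = {"node0": [[8, 9, 10, 11], [44, 45, 46, 47]]}
with open(store_dir + "host1_vm-work-1", "w") as fp:
    json.dump(allocation, fp)  # what the plugin stores after allocating

with open(store_dir + "host1_vm-work-1") as fp:
    assert json.load(fp) == allocation  # what a later run reads back
```

When a stored allocation no longer matches the request, the release branches above merge it back into `numa_nodes_cpus` via `_merge_dicts()` and delete the file before allocating afresh.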
self._cpus_list_to_string() else: self.result['pinning'] = self.pinning @@ -228,17 +295,49 @@ def run(self, tmp=None, task_vars=None): self.result['numa'] = self.numa self.result['number_host_os'] = int(self.number_host_os) self.result['alloc_all'] = self.alloc_all + + if not os.path.isfile(numa_nodes_path): + with open(numa_nodes_path, "w+") as fp_nn: + # Store the dictionary to the file (in JSON format) + json.dump(self.numa_nodes, fp_nn) + display.vvv("store numa_nodes to file: %s -> %s" % (numa_nodes_path, self.numa_nodes)) + + with open(numa_nodes_cpus_path, "w+") as fp_nnc: + # Store the dictionary to the file (in JSON format) + json.dump(self.numa_nodes_cpus, fp_nnc) + display.vv("store numa_nodes_cpus to file: %s -> %s" % (numa_nodes_cpus_path, self.numa_nodes_cpus)) + + if not os.path.isfile(numa_nodes_cpus_orig_path): + with open(numa_nodes_cpus_orig_path, "w+") as fp_nnco: + # Store the dictionary to the file (in JSON format) + json.dump(self.numa_nodes_cpus_orig, fp_nnco) + display.vvv("store numa_nodes_cpus_orig to file: %s -> %s" % (numa_nodes_cpus_orig_path, self.numa_nodes_cpus_orig)) + + if not os.path.isfile(host_os_cpus_path): + with open(host_os_cpus_path, "w+") as fp: + # Store the dictionary to the file (in JSON format) + json.dump(self.host_os_cpus_dict, fp) + display.vvv("store host_os_cpus to file: %s -> %s" % (host_os_cpus_path, self.host_os_cpus_dict)) + + if not os.path.isfile(vm_cpus_path): + with open(vm_cpus_path, "w+") as fp_vm: + # Store allocated cpus to the file (in JSON format) + json.dump(self.cpu_list_dict, fp_vm) + display.vvv("store allocated cpus for VM: %s to file: %s -> %s" % (self.name, vm_cpus_path, self.cpu_list_dict)) + + display.vv("return from cpupin plugin") return dict(self.result) - def _sanity_checks(self, task_vars): + def _sanity_checks(self): """Sanity checks of input values. Input values: @self.number @self.cpus @self.numa @self.pinning - @task_vars - Return value: @task_vars/AnsibleActionFail() + @self.numa_nodes_cpus + Return value: @self.numa_nodes_cpus + @AnsibleActionFail() """ kwargs = {'number': self.number, 'cpus': self.cpus, @@ -258,8 +357,7 @@ def _sanity_checks(self, task_vars): # number of requested CPUs if kwargs['number']: - task_vars = self._allocate_host_os_cpus(task_vars) - self.numa_nodes_cpus = task_vars['numa_nodes_cpus'] + self._allocate_host_os_cpus() # Calculate number of unallocated CPUs if not self.alloc_all: self._number_of_unallocated_host_cpus(self.numa) @@ -305,6 +403,58 @@ def _sanity_checks(self, task_vars): if msg: raise AnsibleActionFail(msg) + def _use_stored_allocation(self): + """Check stored allocation if it can be reused. 
+
+        Input values: @self.cpu_list_dict
+                      @self.numa
+                      @self.number
+                      @self.cpus
+        Return value: True/False
+        """
+        if len(self.cpu_list_dict) == 0:
+            display.vvv("stored allocation is empty")
+            return False
+
+        display.vvv("use_stored_allocation self.numa: %s" % self.numa)
+        if self.numa is not None and len(self.cpu_list_dict) != 1:
+            display.vvv("stored allocation contains more than one numa node")
+            return False
+        tmp_cpu_list = []
+        for key in self.cpu_list_dict.keys():
+            if self.numa is not None:
+                node = f"node{self.numa}"
+                if node != key:
+                    display.vvv("numa node from stored allocation differs from requested numa node")
+                    return False
+
+            if len(self.cpu_list_dict) == 1:
+                # strip 'node' prefix from key
+                self.numa = int(key[len('node'):])
+                display.vvv("set numa from stored allocation: %s" % self.numa)
+
+            tmp_cpu_list += self.cpu_list_dict.get(key)[0]
+            tmp_cpu_list += self.cpu_list_dict.get(key)[1]
+
+        display.vvv("tmp_cpu_list from dict: %s" % (tmp_cpu_list))
+
+        if len(tmp_cpu_list) != int(self.number):
+            display.vvv("number of cpus from stored allocation: %s differs from requested number: %s" % (len(tmp_cpu_list), self.number))
+            return False
+
+        if self.cpus is not None:
+            sorted_cpu_list = sorted(self.cpu_list)
+            sorted_tmp_cpu_list = sorted(tmp_cpu_list)
+            if sorted_cpu_list != sorted_tmp_cpu_list:
+                display.vvv("cpus from stored allocation differ from requested cpus")
+                return False
+
+        self.cpu_list = sorted(tmp_cpu_list)
+        self.cpu_list_count = len(self.cpu_list)
+
+        display.vvv("stored allocation can be reused")
+        return True
+
     def _create_plain_cpu_list(self, cpus_str):
         """Get the string with cpus like '8-9,44-45' and convert it to the list of CPUs
         like '[8,9,44,45]'
@@ -319,7 +469,30 @@ def _create_plain_cpu_list(self, cpus_str):
                 plain_cpu_list += list(range(int(cpus[0]), int(cpus[0]) + 1))
             else:
                 plain_cpu_list += list(range(int(cpus[0]), int(cpus[1]) + 1))
-        return plain_cpu_list
+        return sorted(plain_cpu_list)
+
+    def _create_cpu_list_dict(self, cpu_list, numa):
+        """Get the cpu_list like '[56, 57, 58, 59, 168, 169, 170, 171]' and convert it to the NUMA-marked dict
+        of CPUs like '{'node1': [[56, 57, 58, 59], [168, 169, 170, 171]]}'
+
+        Input values: @cpu_list
+                      @numa
+                      @self.numa_nodes_cpus_orig
+        Return value: @cpu_list_dict
+        """
+        cpu_list_dict = {}
+        sel_cpus = []
+        sel_threads = []
+        node = f"node{numa}"
+
+        for cpu in cpu_list:
+            if cpu in self.numa_nodes_cpus_orig[node][0]:
+                sel_cpus.append(cpu)
+            if cpu in self.numa_nodes_cpus_orig[node][1]:
+                sel_threads.append(cpu)
+        cpu_list_dict[node] = [sel_cpus, sel_threads]
+        display.vv("created cpu list dict: %s" % (cpu_list_dict))
+        return cpu_list_dict

     def _numa_nodes_cpus(self):
         """Collect information about all NUMA nodes CPUs
@@ -350,7 +523,8 @@ def _create_numa_node_cpus_data_structure(self, data):
         Input value: @data
         Return value: @self.numa_nodes_cpus
         """
-        core_count = 0
+        self._supported_cpu_structure()
+        last_core = -1
         for item in data:
             cpu = item[0]
             core = item[1]
@@ -359,16 +533,47 @@
             if numa not in self.numa_nodes_cpus:
                 self.numa_nodes_cpus[numa] = [[], []]
             # CPU
-            if cpu == core:
+            if last_core < int(core):
                 self.numa_nodes_cpus[numa][0].append(int(cpu))
-                core_count += 1
+                last_core = int(core)
             # Thread
-            elif cpu == str(int(core) + core_count):
+            elif cpu == str(int(core) + last_core + 1):
                 self.numa_nodes_cpus[numa][1].append(int(cpu))
             else:
                 msg = "Unsupported CPU core structure -> lscpu -p"
                 raise AnsibleActionFail(msg)

+    def _supported_cpu_structure(self):
+        display.vv("Supported CPU structures from 'lscpu -p' are:\n"
+                   "  1. Normal case (Xeon server + old core platform)\n"
+                   "  # CPU,Core,Socket,Node,,L1d,L1i,L2,L3\n"
+                   "  # CPU\n"
+                   "  0,0,0,0,,0,0,0,0\n"
+                   "  1,1,0,0,,1,1,1,0\n"
+                   "  2,2,0,0,,2,2,2,0\n"
+                   "  3,3,0,0,,3,3,3,0\n"
+                   "  # Thread\n"
+                   "  4,0,0,0,,0,0,0,0\n"
+                   "  5,1,0,0,,1,1,1,0\n"
+                   "  6,2,0,0,,2,2,2,0\n"
+                   "  7,3,0,0,,3,3,3,0\n"
+                   "  2. ADL case\n"
+                   "  # CPU,Core,Socket,Node,,L1d,L1i,L2,L3\n"
+                   "  # P core CPU\n"
+                   "  0,0,0,0,,0,0,0,0\n"
+                   "  # P core Thread\n"
+                   "  1,0,0,0,,0,0,0,0\n"
+                   "  # P core CPU\n"
+                   "  2,1,0,0,,4,4,1,0\n"
+                   "  # P core Thread\n"
+                   "  3,1,0,0,,4,4,1,0\n"
+                   "  ...\n"
+                   "  # E core CPU\n"
+                   "  14,7,0,0,,32,32,8,0\n"
+                   "  15,8,0,0,,33,33,8,0\n"
+                   "  ...\n")
+
     def _cpus_use_same_numa(self):
         """Make sure that all CPUs are using same NUMA node, specified by 'numa'
@@ -397,27 +602,34 @@ def _host_os_cpus_used(self):
     def _check_if_cpus_is_used(self):
         """Make sure that requested CPUs are not from list allocated for other VMs

-        Input value: @self.cpu_list
+        Input values: @self.cpu_list
+                      @self.cpu_list_dict
         Return value: True/False
         """
         node = f"node{self.numa}"
         for cpu in self.cpu_list:
-            if ((cpu not in self.numa_nodes_cpus[node][0]) and
-                    (cpu not in self.numa_nodes_cpus[node][1])):
+            if (((cpu not in self.numa_nodes_cpus[node][0]) and
+                    (cpu not in self.numa_nodes_cpus[node][1])) and
+                    self.cpu_list_dict and
+                    ((cpu not in self.cpu_list_dict[node][0]) and
+                    (cpu not in self.cpu_list_dict[node][1]))):
+                # Requested CPU is not free and is not part of stored allocation for current VM
+                display.vv("Requested CPU: %s is not free and is not part of stored allocation for current VM: %s" % (cpu, self.name))
                 return True
         return False

-    def _allocate_host_os_cpus(self, task_vars):
+    def _allocate_host_os_cpus(self):
         """ Allocate HOST_OS_VCPUS vCPUs for host OS

-        Input values: @task_vars
-        Return value: @task_vars
+        Input values: @self.numa_nodes_cpus
+                      @self.number_host_os
+        Return value: @self.host_os_cpus
+                      @self.host_os_cpus_dict
         """
         if not self.host_os_cpus:
-            self.host_os_cpus = self._select_cpus(task_vars['numa_nodes_cpus'], self.number_host_os, 0)
-            self._modify_available_host_cpus(self.host_os_cpus, 0, task_vars['numa_nodes_cpus'])
-            task_vars['host_os_cpus'] = self.host_os_cpus
-        return task_vars
+            self.host_os_cpus = self._select_cpus(self.number_host_os, 0, True)
+            self._modify_available_host_cpus(self.host_os_cpus, 0)
+            self.host_os_cpus_dict = self._create_cpu_list_dict(self.host_os_cpus, 0)

     def _number_of_unallocated_host_cpus(self, numa):
         """Count unallocated host CPUs
@@ -434,16 +646,25 @@ def _number_of_unallocated_host_cpus(self, numa):
             node = f"node{numa}"
             val = self.numa_nodes_cpus[node]
             tmp = len(val[0]) + len(val[1])
+            display.vv("Unallocated CPUs count: %s for numa: %s" % (tmp, numa))
+            if self.cpu_list_dict and node in self.cpu_list_dict:
+                val = self.cpu_list_dict[node]
+                tmp += len(val[0]) + len(val[1])
+                display.vv("Unallocated CPUs count: %s including stored allocation for numa: %s for current VM: %s" % (tmp, numa, self.name))
             self.unallocated_cpus = tmp
+            self.unallocated_numa = numa
         else:
             tmp_numa = None
             tmp_count = None
             force_numa = False
-            if int(self.number) == 0:
-                force_numa = True
             for k, val in self.numa_nodes_cpus.items():
                 tmp = len(val[0]) + len(val[1])
-                if tmp >= int(self.number):
+                display.vv("Unallocated CPUs count: %s for numa: %s" % (tmp, k))
+                if self.cpu_list_dict and k in self.cpu_list_dict:
+                    val2 = self.cpu_list_dict[k]
+                    tmp += len(val2[0]) + len(val2[1])
+                    display.vv("Unallocated CPUs count: %s including stored allocation for numa: %s for current VM: %s" % (tmp, k, self.name))
+                if tmp > 0 and tmp >= int(self.number):
                     tmp_numa = k.strip("node")
                     tmp_count = tmp
                 else:
@@ -451,90 +672,157 @@ def _number_of_unallocated_host_cpus(self, numa):
                     if tmp > self.unallocated_cpus:
                         self.unallocated_cpus = tmp
+                        self.unallocated_numa = k.strip("node")
             if force_numa:
                 if tmp_numa:
                     self.numa = tmp_numa
                     self.unallocated_cpus = tmp_count
+                    self.unallocated_numa = tmp_numa
+                    display.vv("set forced numa: %s with CPUs: %s" % (self.numa, self.unallocated_cpus))
+        if int(self.number) == 0:
+            if self.unallocated_numa:
+                self.numa = self.unallocated_numa
+                display.vv("set forced numa: %s with max CPUs: %s" % (self.numa, self.unallocated_cpus))

-    def _allocate_all_cpus(self, task_vars):
+    def _allocate_all_cpus(self):
         """ Allocate all CPUs

-        Input value: @task_vars
-        Return value: @task_vars
+        Input values: @self.numa_nodes_cpus
+                      @self.numa_nodes
+        Return value: @self.numa_nodes_cpus
+                      @self.cpu_list
+                      @self.number
+                      @self.numa
+                      @AnsibleActionFail()
         """
         for numa in self.numa_nodes:
             self._number_of_unallocated_host_cpus(numa)
-            tmp_cpu_list = self._select_cpus(task_vars['numa_nodes_cpus'], self.unallocated_cpus, numa)
-            self._modify_available_host_cpus(tmp_cpu_list, numa, task_vars['numa_nodes_cpus'])
+            tmp_cpu_list = self._select_cpus(self.unallocated_cpus, numa)
+            self._modify_available_host_cpus(tmp_cpu_list, numa)
             self.cpu_list += tmp_cpu_list
             self.number = int(self.number) + int(self.unallocated_cpus)
-        self.result['numa'] = None
+        self.numa = None
         # Check if number of allocated vCPUs isn't lower than MINIMUM_VCPUS
         if int(self.number) < MINIMUM_VCPUS:
             msg = (f"Number of allocated CPUs {self.number} is less than required minimum "
                    f"{MINIMUM_VCPUS} vCPUs")
             raise AnsibleActionFail(msg)
         self.cpu_list.sort()
-        return task_vars

-    def _allocate_cpus(self, task_vars):
+    def _allocate_cpus(self):
         """ Allocate required number of CPUs

-        Input value: @task_vars
-        Return value: @task_vars
+        Input values: @self.numa_nodes_cpus
+                      @self.numa
+                      @self.number
+        Return value: @self.numa_nodes_cpus
+                      @self.numa
+                      @self.cpu_list
+                      @self.cpu_list_count
         """
         # Select random NUMA
         if not self.numa:
-            self.numa = random.choice(self.numa_nodes)  # nosec B311 # pseudo random is not used for security purposes
+            if len(self.numa_nodes) > 2 and self.unallocated_numa:
+                # Available memory is equally distributed to NUMA nodes.
+                # This becomes an issue on platforms with more than 2 NUMA nodes:
+                # a single NUMA node is not able to handle more worker VMs.
+                # WA: Assign new VM to the NUMA node with the highest number of free CPUs.
+                # Real solution would be to check available memory before NUMA node selection
+                self.numa = self.unallocated_numa
+                display.vv("WA for more NUMA nodes: Select NUMA node: %s with highest number of free CPUs: %s" % (self.numa, self.unallocated_cpus))
+            else:
+                self.numa = random.choice(self.numa_nodes)  # nosec B311 # pseudo random is not used for security purposes
+                display.vv("Select random NUMA node: %s" % (self.numa))
         if not self.cpus:
-            self.cpu_list = self._select_cpus(task_vars['numa_nodes_cpus'], self.number, self.numa)
+            self.cpu_list = self._select_cpus(self.number, self.numa)
         else:
             if self.cpu_list_count == 0:
                 self.cpu_list = self._create_plain_cpu_list(self.cpus)
                 self.cpu_list_count = len(self.cpu_list)
-        self._modify_available_host_cpus(self.cpu_list, self.numa, task_vars['numa_nodes_cpus'])
-        self.result['numa'] = self.numa
-        return task_vars
+        if not self.cpu_list_dict:
+
self.cpu_list_dict = self._create_cpu_list_dict(self.cpu_list, self.numa) + self._modify_available_host_cpus(self.cpu_list, self.numa) - def _select_cpus(self, numa_nodes_cpus, cpus_number, numa): + def _select_cpus(self, cpus_number, numa, host_os=False): """ Select requested number of CPUs from NUMA node - Input values: @task_vars['numa_nodes_cpus'] + Input values: @self.numa_nodes_cpus @cpus_number @numa + @host_os Return values: selected_cpus """ selected_cpus = [] + sel_cpus = [] + sel_threads = [] + req_cpus = 0 node = f"node{numa}" - req_cpus = int(cpus_number) / 2 - sel_cpus = list(numa_nodes_cpus[node][0][0:int(req_cpus)]) - selected_cpus += sel_cpus + if len(self.numa_nodes_cpus[node][0]) == len(self.numa_nodes_cpus[node][1]): + req_cpus = int(cpus_number) / 2 + sel_cpus = list(self.numa_nodes_cpus[node][0][0:int(req_cpus)]) + selected_cpus += sel_cpus + + sel_threads = list(self.numa_nodes_cpus[node][1][0:int(req_cpus)]) + selected_cpus += sel_threads + else: + threads_len = len(self.numa_nodes_cpus[node][1]) + if not host_os: + req_cpus = int(cpus_number) / 2 + if req_cpus <= threads_len: + req_cpus = int(cpus_number) / 2 + sel_cpus = list(self.numa_nodes_cpus[node][0][0:int(req_cpus)]) + selected_cpus += sel_cpus + + sel_threads = list(self.numa_nodes_cpus[node][1][0:int(req_cpus)]) + selected_cpus += sel_threads + else: + sel_cpus = list(self.numa_nodes_cpus[node][0][0:threads_len]) + selected_cpus += sel_cpus + + sel_threads = list(self.numa_nodes_cpus[node][1][0:threads_len]) + selected_cpus += sel_threads + + e_cpus_count = int(int(cpus_number) - 2 * threads_len) + display.vv("select %s missing CPUs from Efficient-cores" % e_cpus_count) + sel_E_cpus = list(self.numa_nodes_cpus[node][0][threads_len:int(threads_len + e_cpus_count)]) + selected_cpus += sel_E_cpus + sel_cpus += sel_E_cpus + else: + if int(len(self.numa_nodes_cpus[node][0]) - threads_len) >= int(cpus_number): + display.vv("select %s host os CPUs from Efficient-cores" % cpus_number) + sel_cpus = list(self.numa_nodes_cpus[node][0][threads_len:int(threads_len + int(cpus_number))]) + selected_cpus += sel_cpus + else: + msg = (f"Not enough Efficient-cores CPUs for host OS: requested: {cpus_number}, available: " + f"{len(self.numa_nodes_cpus[node][0]) - threads_len}") + raise AnsibleActionFail(msg) + - sel_threads = list(numa_nodes_cpus[node][1][0:int(req_cpus)]) - selected_cpus += sel_threads - return selected_cpus + if not host_os: + self.cpu_list_dict[node] = [sel_cpus, sel_threads] + display.vv("selected cpu list dict: %s" % (self.cpu_list_dict)) + return sorted(selected_cpus) - def _modify_available_host_cpus(self, requested_cpus, requested_numa, task_vars): + def _modify_available_host_cpus(self, requested_cpus, requested_numa): """ Modify dictionary with available cpus on host Input values: @requested_cpus @requested_numa - @task_vars['numa_nodes_cpus'] - Return_value: @task_vars['numa_nodes_cpus'] + @self.numa_nodes_cpus + Return_value: @self.numa_nodes_cpus """ node = f"node{requested_numa}" if len(requested_cpus) > 1: tmp_l = requested_cpus # Going through host_cpus and delete requested cpus - for lst in task_vars[node]: + for lst in self.numa_nodes_cpus[node]: for item in tmp_l: if item in lst: # remove item lst.remove(item) - return task_vars def _plain_cpus_list_to_string(self, cpu_list): """ From input CPUs list create string @@ -586,6 +874,36 @@ def _select_emu_cpus(self): """ self.emu_cpus = [self.cpu_list[0], self.cpu_list[int(self.cpu_list_count / 2)]] + def _merge_lists(self, list1, list2): + """ 
Merge two lists and return ordered list without duplicates + + Input values: @list1 + @list2 + Return value: @merged_list + """ + return sorted(list(set(list1 + list2))) + + def _merge_dicts(self, dict1, dict2): + """ Merge two dicts + + Input values: @dict1 + @dict2 + Return value: @merged_dict + """ + merged_dict = {} + display.vvv("dict1: %s" % (dict1)) + display.vvv("dict2: %s" % (dict2)) + for key in dict1.keys() | dict2.keys(): + if key in dict1 and key in dict2: + merged_dict[key] = [self._merge_lists(dict1[key][0], dict2[key][0]), self._merge_lists(dict1[key][1], dict2[key][1])] + elif key in dict1: + merged_dict[key] = [sorted(dict1[key][0]), sorted(dict1[key][1])] + else: + merged_dict[key] = [sorted(dict2[key][0]), sorted(dict2[key][1])] + merged_dict = dict(sorted(merged_dict.items())) + display.vvv("merged_dict: %s" % (merged_dict)) + return merged_dict + def _pin_cpus(self): """ PIN selected cpus for VM usage diff --git a/ansible.cfg b/ansible.cfg index 264fbc6b..2a05e57a 100644 --- a/ansible.cfg +++ b/ansible.cfg @@ -21,3 +21,5 @@ collections_path = ./collections log_path = ./.ansible_last_run.log display_args_to_stdout = False + +vars_plugins_enabled = host_group_vars,git_revision diff --git a/cloud/README.md b/cloud/README.md index 1452144e..4b628f7c 100644 --- a/cloud/README.md +++ b/cloud/README.md @@ -14,11 +14,11 @@ Cloud RA allows for deploying Intel Container Experience Kits on managed Kuberne - Python 3.8+ -- Azure CLI 2.53.0+ ([Install Guide](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli-linux?pivots=apt)) +- Azure CLI 2.55.0+ ([Install Guide](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli-linux?pivots=apt)) -- AWS CLI 2.13.21+ ([Install Guide](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html)) +- AWS CLI 2.14.5+ ([Install Guide](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html)) -- Terraform 1.5.7+ +- Terraform 1.6.5+ - Docker 20.10.17+ @@ -81,7 +81,7 @@ azureConfig: sg_whitelist_cidr_blocks: [] enable_proximity_placement: true aks: - kubernetes_version: "1.27" + kubernetes_version: "1.28" cni: "kubenet" # Possible values are: kubenet, cilium enable_sgx: false # Requires DCsv series instances in one of node pools default_node_pool: @@ -120,7 +120,7 @@ awsConfig: sg_whitelist_cidr_blocks: [] ecr_repositories: [] eks: - kubernetes_version: "1.27" + kubernetes_version: "1.28" subnets: ["subnet_a", "subnet_b"] custom_ami: "ubuntu" # Comment out this line to use Amazon Linux 2 OS node_groups: diff --git a/cloud/cwdf_example_aws.yaml b/cloud/cwdf_example_aws.yaml index 097f700b..f4a7b168 100644 --- a/cloud/cwdf_example_aws.yaml +++ b/cloud/cwdf_example_aws.yaml @@ -15,7 +15,7 @@ awsConfig: sg_whitelist_cidr_blocks: [] ecr_repositories: [] eks: - kubernetes_version: "1.27" + kubernetes_version: "1.28" subnets: ["subnet_a", "subnet_b"] custom_ami: "ubuntu" # Comment out this line to use Amazon Linux 2 OS node_groups: diff --git a/cloud/cwdf_example_azure.yaml b/cloud/cwdf_example_azure.yaml index 9e10301c..6cbcd503 100644 --- a/cloud/cwdf_example_azure.yaml +++ b/cloud/cwdf_example_azure.yaml @@ -14,7 +14,7 @@ azureConfig: sg_whitelist_cidr_blocks: [] enable_proximity_placement: true aks: - kubernetes_version: "1.27" + kubernetes_version: "1.28" cni: "kubenet" # Possible values are: kubenet, cilium, cilium-ebpf enable_sgx: false # Requires DCsv series instances in one of node pools default_node_pool: diff --git a/cloud/cwdf_util/config.py b/cloud/cwdf_util/config.py index 
8f59b21f..77e1638b 100644 --- a/cloud/cwdf_util/config.py +++ b/cloud/cwdf_util/config.py @@ -24,7 +24,7 @@ Optional("root_volume_type", default='gp2'): str }], Optional("eks"): { - Optional("kubernetes_version", default='1.27'): Or("1.25", "1.26", "1.27"), + Optional("kubernetes_version", default='1.28'): Or("1.26", "1.27", "1.28"), "subnets": [str], Optional("install_ebs_csi_driver", default=True): bool, Optional("custom_ami", default=None): str, @@ -48,7 +48,7 @@ Optional("enable_proximity_placement", default=False): bool, Optional("ansible_instance_size", default="Standard_B2s"): str, Optional("aks"): { - Optional("kubernetes_version", default='1.27'): Or("1.26", "1.27"), + Optional("kubernetes_version", default='1.28'): Or("1.26", "1.27", "1.28"), Optional("cni", default="cilium"): Or("cilium", "kubenet"), Optional("enable_sgx", default=False): bool, "default_node_pool": { diff --git a/cloud/cwdf_util/templates/cloudcli/aws/aws_cloudcli_deploy.sh.j2 b/cloud/cwdf_util/templates/cloudcli/aws/aws_cloudcli_deploy.sh.j2 index 168efbae..24e1ae90 100644 --- a/cloud/cwdf_util/templates/cloudcli/aws/aws_cloudcli_deploy.sh.j2 +++ b/cloud/cwdf_util/templates/cloudcli/aws/aws_cloudcli_deploy.sh.j2 @@ -47,21 +47,35 @@ ANSIBLE_INSTANCE_IMAGE=$(aws ec2 describe-images \ ANSIBLE_INSTANCE_TYPE="t3.medium" ANSIBLE_INSTANCE_NAME="cwdf-infra-{{ cloud_config.job_id }}-ansible-instance" -# Generate Ansible instance SSH Host Key -if [ ! -f ./ansible_host ] +# Generate Ansible instance SSH Host Keys +if [ ! -f ${SCRIPT_DIR}/ansible_host_rsa ] then - ssh-keygen -q -N "" -t rsa -f ./ansible_host + ssh-keygen -q -N "" -t rsa -f ${SCRIPT_DIR}/ansible_host_rsa fi -ANSIBLE_INSTANCE_HOST_PRIVKEY=$(cat ./ansible_host) -ANSIBLE_INSTANCE_HOST_PUBKEY=$(cat ./ansible_host.pub) +ANSIBLE_INSTANCE_HOST_RSA_PRIVKEY=$(cat ${SCRIPT_DIR}/ansible_host_rsa) +ANSIBLE_INSTANCE_HOST_RSA_PUBKEY=$(cat ${SCRIPT_DIR}/ansible_host_rsa.pub) + +if [ ! -f ${SCRIPT_DIR}/ansible_host_ed25519 ] +then + ssh-keygen -q -N "" -t ed25519 -f ${SCRIPT_DIR}/ansible_host_ed25519 +fi +ANSIBLE_INSTANCE_HOST_ED25519_PRIVKEY=$(cat ${SCRIPT_DIR}/ansible_host_ed25519) +ANSIBLE_INSTANCE_HOST_ED25519_PUBKEY=$(cat ${SCRIPT_DIR}/ansible_host_ed25519.pub) + +if [ ! 
-f ${SCRIPT_DIR}/ansible_host_ecdsa ] +then + ssh-keygen -q -N "" -t ecdsa -f ${SCRIPT_DIR}/ansible_host_ecdsa +fi +ANSIBLE_INSTANCE_HOST_ECDSA_PRIVKEY=$(cat ${SCRIPT_DIR}/ansible_host_ecdsa) +ANSIBLE_INSTANCE_HOST_ECDSA_PUBKEY=$(cat ${SCRIPT_DIR}/ansible_host_ecdsa.pub) # Ansible instance entrypoint script ANSIBLE_INSTANCE_ENTRYPOINT="$(cat <<- "EOM" #!/usr/bin/env bash -echo $ANSIBLE_INSTANCE_HOST_PRIVKEY > /etc/ssh/ssh_host_rsa_key +echo $ANSIBLE_INSTANCE_HOST_RSA_PRIVKEY > /etc/ssh/ssh_host_rsa_key +echo $ANSIBLE_INSTANCE_HOST_ED25519_PRIVKEY > /etc/ssh/ssh_host_ed25519_key +echo $ANSIBLE_INSTANCE_HOST_ECDSA_PRIVKEY > /etc/ssh/ssh_host_ecdsa_key rm /etc/ssh/ssh_host_dsa_key -rm /etc/ssh/ssh_host_ed25519_key -rm /etc/ssh/ssh_host_ecdsa_key apt-get -qq -y update apt-get -qq -y upgrade apt-get -qq -y install python3-pip python3-venv @@ -371,13 +385,19 @@ JSON_OUTPUT=$(jq -n \ --arg cloud_provider "aws" \ --arg cr_url $ECR_URL \ --arg k8s_worker_username "ubuntu" \ - --arg host_key "$ANSIBLE_INSTANCE_HOST_PUBKEY" \ + --arg rsa_host_key "$ANSIBLE_INSTANCE_HOST_RSA_PUBKEY" \ + --arg ecdsa_host_key "$ANSIBLE_INSTANCE_HOST_ECDSA_PUBKEY" \ + --arg ed25519_host_key "$ANSIBLE_INSTANCE_HOST_ED25519_PUBKEY" \ '{ansible_host_public_ip: {value: $ansible_host_ip}, cloud_provider: {value: $cloud_provider}, - ansible_ssh_host_key: - {value: $host_key}, + ansible_ssh_rsa_host_key: + {value: $rsa_host_key}, + ansible_ssh_ecdsa_host_key: + {value: $ecdsa_host_key}, + ansible_ssh_ed25519_host_key: + {value: $ed25519_host_key}, cr_url: {value: $cr_url}, k8s_worker_username: diff --git a/cloud/cwdf_util/templates/cloudcli/azure/azure_cloudcli_deploy.sh.j2 b/cloud/cwdf_util/templates/cloudcli/azure/azure_cloudcli_deploy.sh.j2 index eafa795d..ad4e1134 100644 --- a/cloud/cwdf_util/templates/cloudcli/azure/azure_cloudcli_deploy.sh.j2 +++ b/cloud/cwdf_util/templates/cloudcli/azure/azure_cloudcli_deploy.sh.j2 @@ -30,21 +30,35 @@ NIC_NAME="cwdf-infra-{{ cloud_config.job_id }}-ansible-instance-nic" ANSIBLE_INSTANCE_NAME="cwdf-infra-{{ cloud_config.job_id }}-ansible-instance" ANSIBLE_INSTANCE_IMAGE="Canonical:0001-com-ubuntu-server-jammy:22_04-lts-gen2:latest" -# Generate Ansible instance SSH Host Key -if [ ! -f ${SCRIPT_DIR}/ansible_host ] +# Generate Ansible instance SSH Host Keys +if [ ! -f ${SCRIPT_DIR}/ansible_host_rsa ] then - ssh-keygen -q -N "" -t rsa -f ${SCRIPT_DIR}/ansible_host + ssh-keygen -q -N "" -t rsa -f ${SCRIPT_DIR}/ansible_host_rsa fi -ANSIBLE_INSTANCE_HOST_PRIVKEY=$(cat ${SCRIPT_DIR}/ansible_host) -ANSIBLE_INSTANCE_HOST_PUBKEY=$(cat ${SCRIPT_DIR}/ansible_host.pub) +ANSIBLE_INSTANCE_HOST_RSA_PRIVKEY=$(cat ${SCRIPT_DIR}/ansible_host_rsa) +ANSIBLE_INSTANCE_HOST_RSA_PUBKEY=$(cat ${SCRIPT_DIR}/ansible_host_rsa.pub) + +if [ ! -f ${SCRIPT_DIR}/ansible_host_ed25519 ] +then + ssh-keygen -q -N "" -t ed25519 -f ${SCRIPT_DIR}/ansible_host_ed25519 +fi +ANSIBLE_INSTANCE_HOST_ED25519_PRIVKEY=$(cat ${SCRIPT_DIR}/ansible_host_ed25519) +ANSIBLE_INSTANCE_HOST_ED25519_PUBKEY=$(cat ${SCRIPT_DIR}/ansible_host_ed25519.pub) + +if [ ! 
-f ${SCRIPT_DIR}/ansible_host_ecdsa ] +then + ssh-keygen -q -N "" -t ecdsa -f ${SCRIPT_DIR}/ansible_host_ecdsa +fi +ANSIBLE_INSTANCE_HOST_ECDSA_PRIVKEY=$(cat ${SCRIPT_DIR}/ansible_host_ecdsa) +ANSIBLE_INSTANCE_HOST_ECDSA_PUBKEY=$(cat ${SCRIPT_DIR}/ansible_host_ecdsa.pub) # Ansible instance entrypoint script ANSIBLE_INSTANCE_ENTRYPOINT="$(cat <<- "EOM" #!/usr/bin/env bash -echo $ANSIBLE_INSTANCE_HOST_PRIVKEY > /etc/ssh/ssh_host_rsa_key +echo $ANSIBLE_INSTANCE_HOST_RSA_PRIVKEY > /etc/ssh/ssh_host_rsa_key +echo $ANSIBLE_INSTANCE_HOST_ED25519_PRIVKEY > /etc/ssh/ssh_host_ed25519_key +echo $ANSIBLE_INSTANCE_HOST_ECDSA_PRIVKEY > /etc/ssh/ssh_host_ecdsa_key rm /etc/ssh/ssh_host_dsa_key -rm /etc/ssh/ssh_host_ed25519_key -rm /etc/ssh/ssh_host_ecdsa_key mkdir -p /etc/apt/keyrings curl -sLS https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor | @@ -491,13 +505,19 @@ JSON_OUTPUT=$(jq -n \ --arg k8s_worker_username "azureuser" \ --arg rg_name "$AZ_GROUP_NAME" \ --arg sub_id "$SUBSCRIPTION_ID" \ - --arg host_key "$ANSIBLE_INSTANCE_HOST_PUBKEY" \ + --arg rsa_host_key "$ANSIBLE_INSTANCE_HOST_RSA_PUBKEY" \ + --arg ecdsa_host_key "$ANSIBLE_INSTANCE_HOST_ECDSA_PUBKEY" \ + --arg ed25519_host_key "$ANSIBLE_INSTANCE_HOST_ED25519_PUBKEY" \ '{aks_cluster_name: {value: $aks_name}, aks_scale_sets_rg: {value: $aks_scale_set}, - ansible_host_public_ip: - {value: $ansible_host_ip}, + ansible_ssh_rsa_host_key: + {value: $rsa_host_key}, + ansible_ssh_ecdsa_host_key: + {value: $ecdsa_host_key}, + ansible_ssh_ed25519_host_key: + {value: $ed25519_host_key}, cloud_provider: {value: $cloud_provider}, ansible_ssh_host_key: diff --git a/cloud/cwdf_util/templates/terraform/aws/ansible_host.tf.jinja b/cloud/cwdf_util/templates/terraform/aws/ansible_host.tf.jinja index a33333d4..3e9f7f0f 100644 --- a/cloud/cwdf_util/templates/terraform/aws/ansible_host.tf.jinja +++ b/cloud/cwdf_util/templates/terraform/aws/ansible_host.tf.jinja @@ -148,6 +148,15 @@ resource "tls_private_key" "ansible_ssh_rsa_host_key" { rsa_bits = 4096 } +resource "tls_private_key" "ansible_ssh_ecdsa_host_key" { + algorithm = "ECDSA" + ecdsa_curve = "P384" +} + +resource "tls_private_key" "ansible_ssh_ed25519_host_key" { + algorithm = "ED25519" +} + resource "aws_instance" "ansible" { ami = data.aws_ami.ubuntu2204.id instance_type = "{{ ansible_instance_type }}" @@ -171,9 +180,9 @@ resource "aws_instance" "ansible" { user_data = < /etc/ssh/ssh_host_rsa_key +echo '${tls_private_key.ansible_ssh_ecdsa_host_key.private_key_pem}' > /etc/ssh/ssh_host_ecdsa_key +echo '${tls_private_key.ansible_ssh_ed25519_host_key.private_key_pem}' > /etc/ssh/ssh_host_ed25519_key rm /etc/ssh/ssh_host_dsa_key -rm /etc/ssh/ssh_host_ed25519_key -rm /etc/ssh/ssh_host_ecdsa_key apt-get -qq -y update apt-get -qq -y upgrade apt-get -qq -y install python3-pip python3-venv @@ -217,6 +226,14 @@ output "ansible_host_public_ip" { value = aws_eip.ansible.public_ip } -output "ansible_host_ssh_host_key" { +output "ansible_host_ssh_host_key_rsa" { value = tls_private_key.ansible_ssh_rsa_host_key.public_key_openssh } + +output "ansible_host_ssh_host_key_ecdsa" { + value = tls_private_key.ansible_ssh_ecdsa_host_key.public_key_openssh +} + +output "ansible_host_ssh_host_key_ed25519" { + value = tls_private_key.ansible_ssh_ed25519_host_key.public_key_openssh +} diff --git a/cloud/cwdf_util/templates/terraform/aws/provider.tf.jinja b/cloud/cwdf_util/templates/terraform/aws/provider.tf.jinja index 780fd121..c704d7af 100644 --- a/cloud/cwdf_util/templates/terraform/aws/provider.tf.jinja +++ 
b/cloud/cwdf_util/templates/terraform/aws/provider.tf.jinja @@ -2,15 +2,15 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = "5.17.0" + version = "5.31.0" } kubernetes = { source = "hashicorp/kubernetes" - version = "2.23.0" + version = "2.24.0" } helm = { source = "hashicorp/helm" - version = "2.11.0" + version = "2.12.1" } } } diff --git a/cloud/cwdf_util/templates/terraform/azure/aks.tf.jinja b/cloud/cwdf_util/templates/terraform/azure/aks.tf.jinja index f36c2c9f..581f233f 100644 --- a/cloud/cwdf_util/templates/terraform/azure/aks.tf.jinja +++ b/cloud/cwdf_util/templates/terraform/azure/aks.tf.jinja @@ -33,6 +33,10 @@ resource "azurerm_kubernetes_cluster" "default" { kubelet_config { cpu_manager_policy = "{{ aks.default_node_pool.kubelet_cpu_manager_policy }}" } + + upgrade_settings { + max_surge = "10%" + } } network_profile { diff --git a/cloud/cwdf_util/templates/terraform/azure/ansible_host.tf.jinja b/cloud/cwdf_util/templates/terraform/azure/ansible_host.tf.jinja index c2a413c3..7b4df461 100644 --- a/cloud/cwdf_util/templates/terraform/azure/ansible_host.tf.jinja +++ b/cloud/cwdf_util/templates/terraform/azure/ansible_host.tf.jinja @@ -44,6 +44,15 @@ resource "tls_private_key" "ansible_ssh_rsa_host_key" { rsa_bits = 4096 } +resource "tls_private_key" "ansible_ssh_ecdsa_host_key" { + algorithm = "ECDSA" + ecdsa_curve = "P384" +} + +resource "tls_private_key" "ansible_ssh_ed25519_host_key" { + algorithm = "ED25519" +} + resource "azurerm_linux_virtual_machine" "ansible_instance" { name = "cwdf-infra-{{ job_id }}-ansible-instance" resource_group_name = azurerm_resource_group.default.name @@ -85,9 +94,9 @@ resource "azurerm_linux_virtual_machine" "ansible_instance" { custom_data = base64encode(< /etc/ssh/ssh_host_rsa_key +echo '${tls_private_key.ansible_ssh_ecdsa_host_key.private_key_pem}' > /etc/ssh/ssh_host_ecdsa_key +echo '${tls_private_key.ansible_ssh_ed25519_host_key.private_key_pem}' > /etc/ssh/ssh_host_ed25519_key rm /etc/ssh/ssh_host_dsa_key -rm /etc/ssh/ssh_host_ed25519_key -rm /etc/ssh/ssh_host_ecdsa_key mkdir -p /etc/apt/keyrings curl -sLS https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor | @@ -264,6 +273,14 @@ output "ansible_host_public_ip" { value = azurerm_public_ip.ansible_instance.ip_address } -output "ansible_host_ssh_host_key" { +output "ansible_host_ssh_host_key_rsa" { value = tls_private_key.ansible_ssh_rsa_host_key.public_key_openssh } + +output "ansible_host_ssh_host_key_ecdsa" { + value = tls_private_key.ansible_ssh_ecdsa_host_key.public_key_openssh +} + +output "ansible_host_ssh_host_key_ed25519" { + value = tls_private_key.ansible_ssh_ed25519_host_key.public_key_openssh +} diff --git a/cloud/cwdf_util/templates/terraform/azure/provider.tf.jinja b/cloud/cwdf_util/templates/terraform/azure/provider.tf.jinja index c6d54c29..91b085a0 100644 --- a/cloud/cwdf_util/templates/terraform/azure/provider.tf.jinja +++ b/cloud/cwdf_util/templates/terraform/azure/provider.tf.jinja @@ -2,11 +2,11 @@ terraform { required_providers { azurerm = { source = "hashicorp/azurerm" - version = "3.74.0" + version = "3.85.0" } helm = { source = "hashicorp/helm" - version = "2.11.0" + version = "2.12.1" } } } diff --git a/cloud/deployer.py b/cloud/deployer.py index f1f9f23b..b11b1ca6 100644 --- a/cloud/deployer.py +++ b/cloud/deployer.py @@ -156,12 +156,17 @@ def deploy(deployment_dir, provisioner_tool): click.echo("Public ip: " + worker["public_ip"]) click.echo("-------------------") ssh_username = 
provisioning_output["k8s_worker_username"]["value"] - ssh_host_key_raw = provisioning_output["ansible_host_ssh_host_key"]["value"][8:] - ssh_host_key = SSHHostKey("ssh-rsa", ssh_host_key_raw) + rsa_ssh_host_key_raw = provisioning_output["ansible_host_ssh_host_key_rsa"]["value"].split(' ')[1] + ecdsa_ssh_host_key_raw = provisioning_output["ansible_host_ssh_host_key_ecdsa"]["value"].split(' ')[1] + ecdsa_ssh_host_key_type = provisioning_output["ansible_host_ssh_host_key_ecdsa"]["value"].split(' ')[0] + ed25519_ssh_host_key_raw = provisioning_output["ansible_host_ssh_host_key_ed25519"]["value"].split(' ')[1] + rsa_ssh_host_key = SSHHostKey("ssh-rsa", rsa_ssh_host_key_raw) + ecdsa_ssh_host_key = SSHHostKey(ecdsa_ssh_host_key_type, ecdsa_ssh_host_key_raw) + ed25519_ssh_host_key = SSHHostKey("ssh-ed25519", ed25519_ssh_host_key_raw) click.echo("Opening SSH connection to Ansible host...") ssh = SSHConnector(ip_address=ansible_host_ip, username='ubuntu', - host_keys=[ssh_host_key], + host_keys=[rsa_ssh_host_key, ecdsa_ssh_host_key, ed25519_ssh_host_key], priv_key=private_key_path, try_loop=True) click.echo("Opened SSH connection.") @@ -218,7 +223,9 @@ def deploy(deployment_dir, provisioner_tool): with open(file=sw_config_path, mode='r', encoding='utf-8') as file: sw_configuration = yaml.load(file, Loader=yaml.FullLoader) sw_configuration['ansible_host_ip'] = ansible_host_ip - sw_configuration['ansible_ssh_host_key'] = ssh_host_key_raw + sw_configuration['ansible_ssh_rsa_host_key'] = rsa_ssh_host_key_raw + sw_configuration['ansible_ssh_ecdsa_host_key'] = ecdsa_ssh_host_key_raw + sw_configuration['ansible_ssh_ed25519_host_key'] = ed25519_ssh_host_key_raw sw_configuration['worker_ips'] = workers_ip sw_configuration['ssh_user'] = ssh_username sw_configuration['ssh_key'] = os.path.join('..', private_key_path) diff --git a/cloud/discovery/cpu_arch.yml b/cloud/discovery/cpu_arch.yml index fcb05b3f..7324d366 100644 --- a/cloud/discovery/cpu_arch.yml +++ b/cloud/discovery/cpu_arch.yml @@ -3,7 +3,61 @@ architectures: name: 'Sapphire Rapids' note: '4th Generation Intel(R) Xeon(R) Scalable Processor' models: - - 'Not available yet.' 
+ - 'Intel(R) Xeon(R) Gold 5403N' + - 'Intel(R) Xeon(R) Gold 6403N' + - 'Intel(R) Xeon(R) Gold 6423N' + - 'Intel(R) Xeon(R) Gold 6433N' + - 'Intel(R) Xeon(R) Gold 6433NE' + - 'Intel(R) Xeon(R) Gold 6443N' + - 'Intel(R) Xeon(R) Platinum 8444H' + - 'Intel(R) Xeon(R) Platinum 8450H' + - 'Intel(R) Xeon(R) Platinum 8452Y' + - 'Intel(R) Xeon(R) Platinum 8454H' + - 'Intel(R) Xeon(R) Platinum 8458P' + - 'Intel(R) Xeon(R) Platinum 8460H' + - 'Intel(R) Xeon(R) Platinum 8460Y+' + - 'Intel(R) Xeon(R) Platinum 8461V' + - 'Intel(R) Xeon(R) Platinum 8462Y+' + - 'Intel(R) Xeon(R) Platinum 8468' + - 'Intel(R) Xeon(R) Platinum 8468H' + - 'Intel(R) Xeon(R) Platinum 8468V' + - 'Intel(R) Xeon(R) Platinum 8470' + - 'Intel(R) Xeon(R) Platinum 8470N' + - 'Intel(R) Xeon(R) Platinum 8470Q' + - 'Intel(R) Xeon(R) Platinum 8471N' + - 'Intel(R) Xeon(R) Platinum 8480+' + - 'Intel(R) Xeon(R) Platinum 8490H' + - 'Intel(R) Xeon(R) Gold 5411N' + - 'Intel(R) Xeon(R) Gold 5412U' + - 'Intel(R) Xeon(R) Gold 5415+' + - 'Intel(R) Xeon(R) Gold 5416S' + - 'Intel(R) Xeon(R) Gold 5418N' + - 'Intel(R) Xeon(R) Gold 5418Y' + - 'Intel(R) Xeon(R) Gold 5420+' + - 'Intel(R) Xeon(R) Gold 5423N' + - 'Intel(R) Xeon(R) Gold 5433N' + - 'Intel(R) Xeon(R) Gold 6414U' + - 'Intel(R) Xeon(R) Gold 6416H' + - 'Intel(R) Xeon(R) Gold 6418H' + - 'Intel(R) Xeon(R) Gold 6421N' + - 'Intel(R) Xeon(R) Gold 6426Y' + - 'Intel(R) Xeon(R) Gold 6428N' + - 'Intel(R) Xeon(R) Gold 6430' + - 'Intel(R) Xeon(R) Gold 6434' + - 'Intel(R) Xeon(R) Gold 6434H' + - 'Intel(R) Xeon(R) Gold 6438M' + - 'Intel(R) Xeon(R) Gold 6438N' + - 'Intel(R) Xeon(R) Gold 6438Y+' + - 'Intel(R) Xeon(R) Gold 6442Y' + - 'Intel(R) Xeon(R) Gold 6444Y' + - 'Intel(R) Xeon(R) Gold 6448H' + - 'Intel(R) Xeon(R) Gold 6448Y' + - 'Intel(R) Xeon(R) Gold 6454S' + - 'Intel(R) Xeon(R) Gold 6458Q' + - 'Intel(R) Xeon(R) Silver 4410T' + - 'Intel(R) Xeon(R) Silver 4410Y' + - 'Intel(R) Xeon(R) Silver 4416+' + - 'Intel(R) Xeon(R) Bronze 3408U' icx: name: 'IceLake' note: '3rd Generation Intel(R) Xeon(R) Scalable Processor' diff --git a/cloud/discovery/profiles.yml b/cloud/discovery/profiles.yml index 546fb97d..2d8355a2 100644 --- a/cloud/discovery/profiles.yml +++ b/cloud/discovery/profiles.yml @@ -64,6 +64,7 @@ # ddp_update # fw_update # - intel_sriov_fec_operator +# - infra_power_manager --- access: @@ -133,6 +134,7 @@ access: ddp_update: optional fw_update: optional intel_sriov_fec_operator: on + infra_power_manager: optional basic: name: basic @@ -165,6 +167,7 @@ basic: enabled: optional flow_config: optional fw_update: optional + infra_power_manager: optional full_nfv: name: full_nfv @@ -228,6 +231,7 @@ full_nfv: ddp_update: optional fw_update: optional intel_sriov_fec_operator: optional + infra_power_manager: optional on_prem: name: on_prem @@ -282,6 +286,7 @@ on_prem: enabled: optional flow_config: optional fw_update: optional + infra_power_manager: optional regional_dc: name: regional_dc @@ -323,6 +328,7 @@ regional_dc: enabled: optional flow_config: optional fw_update: optional + infra_power_manager: optional remote_fp: name: remote_fp @@ -380,6 +386,7 @@ remote_fp: flow_config: optional ddp_update: optional fw_update: optional + infra_power_manager: optional storage: name: storage @@ -415,6 +422,7 @@ storage: flow_config: optional ddp_update: optional fw_update: optional + infra_power_manager: optional build_your_own: name: build_your_own @@ -478,3 +486,4 @@ build_your_own: ddp_update: optional fw_update: optional intel_sriov_fec_operator: optional + infra_power_manager: optional diff --git 
a/cloud/requirements.txt b/cloud/requirements.txt index 1b9ff99d..cb931167 100644 --- a/cloud/requirements.txt +++ b/cloud/requirements.txt @@ -1,14 +1,13 @@ -click~=8.1.3 -PyYAML~=6.0 -schema~=0.7.5 -Jinja2~=3.1.2 -paramiko~=2.11.0 -scp~=0.14.4 -pycryptodome~=3.15.0 -validators~=0.20.0 -docker~=6.0.0 -boto3~=1.24.60 -GitPython~=3.1.27 -azure.identity~=1.11.0 -azure.mgmt.network~=22.1.0 -azure.mgmt.compute~=29.0.0 \ No newline at end of file +click==8.1.7 +PyYAML==6.0.1 +schema==0.7.5 +Jinja2==3.1.3 +scp==0.14.5 +pycryptodome==3.19.1 +validators==0.22.0 +docker==6.1.3 +boto3==1.34.15 +GitPython==3.1.41 +azure.identity==1.15.0 +azure.mgmt.network==25.2.0 +azure.mgmt.compute==30.4.0 \ No newline at end of file diff --git a/cloud/sw_deployment/sw_deployment_tool.py b/cloud/sw_deployment/sw_deployment_tool.py index ebfa894c..f6a1603c 100644 --- a/cloud/sw_deployment/sw_deployment_tool.py +++ b/cloud/sw_deployment/sw_deployment_tool.py @@ -18,7 +18,9 @@ 'region': None }, 'ansible_host_ip': None, - 'ansible_ssh_host_key': None, + 'ansible_ssh_rsa_host_key': None, + 'ansible_ssh_ecdsa_host_key': None, + 'ansible_ssh_ed25519_host_key': None, 'controller_ips': [], 'worker_ips': [], 'ssh_key': None, @@ -332,8 +334,10 @@ def _docker_login(node_ips, ssh_client, user, registry, registry_username, passw """ for node_ip in node_ips: - ssh_host_key = SSHHostKey("ssh-rsa", configuration['ansible_ssh_host_key']) - ssh_node = SSHConnector(node_ip, user, 22, [ssh_host_key], configuration['ssh_key'], ssh_client.client) + ssh_host_key_rsa = SSHHostKey("ssh-rsa", configuration['ansible_ssh_rsa_host_key']) + ssh_host_key_ecdsa = SSHHostKey("ecdsa", configuration['ansible_ssh_ecdsa_host_key']) + ssh_host_key_ed25519 = SSHHostKey("ed25519", configuration['ansible_ssh_ed25519_host_key']) + ssh_node = SSHConnector(node_ip, user, 22, [ssh_host_key_rsa, ssh_host_key_ecdsa, ssh_host_key_ed25519], configuration['ssh_key'], ssh_client.client) ssh_node.exec_command(command=f"docker login {registry} --username {registry_username} --password {password}", print_output=True) ssh_node.close_connection() @@ -356,8 +360,15 @@ def cleanup(config): _parse_configuration_file(config=config) - ssh_host_key = SSHHostKey("ssh-rsa", configuration['ansible_ssh_host_key']) - client = SSHConnector(ip_address=configuration['ansible_host_ip'], username='ubuntu', host_keys=[ssh_host_key], priv_key=configuration['ssh_key']) + ssh_host_key_rsa = SSHHostKey("ssh-rsa", configuration['ansible_ssh_rsa_host_key']) + ssh_host_key_ecdsa = SSHHostKey("ecdsa", configuration['ansible_ssh_ecdsa_host_key']) + ssh_host_key_ed25519 = SSHHostKey("ed25519", configuration['ansible_ssh_ed25519_host_key']) + client = SSHConnector( + ip_address=configuration['ansible_host_ip'], + username='ubuntu', + host_keys=[ssh_host_key_rsa, ssh_host_key_ecdsa, ssh_host_key_ed25519], + priv_key=configuration['ssh_key'] + ) for image in configuration['exec_containers']: image_name = image.replace('/', '-') @@ -384,8 +395,15 @@ def _deploy(provider, ansible_host_ip, ssh_key, ssh_user, custom_ami): """ click.echo("-------------------") click.secho(f"Connecting to Ansible instance with IP: {configuration['ansible_host_ip']}", fg="yellow") - ssh_host_key = SSHHostKey("ssh-rsa", configuration['ansible_ssh_host_key']) - client = SSHConnector(ip_address=ansible_host_ip, username='ubuntu', host_keys=[ssh_host_key], priv_key=ssh_key) + ssh_host_key_rsa = SSHHostKey("ssh-rsa", configuration['ansible_ssh_rsa_host_key']) + ssh_host_key_ecdsa = SSHHostKey("ecdsa", 
configuration['ansible_ssh_ecdsa_host_key']) + ssh_host_key_ed25519 = SSHHostKey("ed25519", configuration['ansible_ssh_ed25519_host_key']) + client = SSHConnector( + ip_address=ansible_host_ip, + username='ubuntu', + host_keys=[ssh_host_key_rsa, ssh_host_key_ecdsa, ssh_host_key_ed25519], + priv_key=ssh_key + ) click.echo("-------------------") click.secho("Copy private SSH key to Ansible instance", fg="yellow") @@ -515,8 +533,15 @@ def _deploy(provider, ansible_host_ip, ssh_key, ssh_user, custom_ami): configuration['exec_containers']): click.echo("-------------------") click.secho("Copy Docker images to cloud registry") - ssh_host_key = SSHHostKey("ssh-rsa", configuration['ansible_ssh_host_key']) - ssh_client = SSHConnector(ip_address=ansible_host_ip, username='ubuntu', host_keys=[ssh_host_key], priv_key=ssh_key) + ssh_host_key_rsa = SSHHostKey("ssh-rsa", configuration['ansible_ssh_rsa_host_key']) + ssh_host_key_ecdsa = SSHHostKey("ecdsa", configuration['ansible_ssh_ecdsa_host_key']) + ssh_host_key_ed25519 = SSHHostKey("ed25519", configuration['ansible_ssh_ed25519_host_key']) + ssh_client = SSHConnector( + ip_address=ansible_host_ip, + username='ubuntu', + host_keys=[ssh_host_key_rsa, ssh_host_key_ecdsa, ssh_host_key_ed25519], + priv_key=ssh_key + ) click.echo(configuration['exec_containers']) click.echo(f"From registry: {configuration['replicate_from_container_registry']}") docker_mgmt = DockerManagement(from_registry=configuration['replicate_from_container_registry'], diff --git a/collections/requirements.yml b/collections/requirements.yml index ea5eefe0..78d16b12 100644 --- a/collections/requirements.yml +++ b/collections/requirements.yml @@ -1,4 +1,8 @@ collections: - name: https://github.com/kubernetes-sigs/kubespray type: git - version: d646053c0e7fdc0dc95661d1b5ab72dd61ab0576 + version: 6b1188e3dcce4b1afa10c79cd23bf86fbc987e0a + + - name: https://github.com/ansible-collections/kubernetes.core + type: git + version: 2.4.0 diff --git a/collections/share/meta/runtime.yml b/collections/share/meta/runtime.yml new file mode 100644 index 00000000..2333e176 --- /dev/null +++ b/collections/share/meta/runtime.yml @@ -0,0 +1 @@ +requires_ansible: ">=2.14.0" diff --git a/collections/share/roles/install_gpu_driver/defaults/main.yml b/collections/share/roles/install_gpu_driver/defaults/main.yml index d4c7984e..306deb4a 100644 --- a/collections/share/roles/install_gpu_driver/defaults/main.yml +++ b/collections/share/roles/install_gpu_driver/defaults/main.yml @@ -11,56 +11,145 @@ gpu_repo_ubuntu_url: https://repositories.intel.com/gpu/ubuntu gpu_repo_spec_u2204_server: "jammy/production/2328 unified" gpu_repo_spec_u2204_client: "jammy client" -kernel_dkms_packages: +gpu_repo_rhel92_url: https://repositories.intel.com/gpu/rhel/9.2/production/2328/unified/intel-gpu-9.2.repo + +kernel_ubuntu_dkms_packages: - gawk - dkms - libc6-dev +kernel_rhel_dkms_packages: + - gawk + - dkms + # intel server gpu packages -gpu_kmd_packages_u2204_20230912_server: - - intel-i915-dkms=1.23.6.29.230425.38+i53-1 - - intel-fw-gpu=2023.30.2-233~22.04 +gpu_kmd_packages_u2204_20231219_server: + - intel-i915-dkms=1.23.6.42.230425.56+i81-1 + - intel-fw-gpu=2023.39.2-255~22.04 -gpu_umd_rt_packages_u2204_20230912_server: - - intel-opencl-icd=23.22.26516.29-682~22.04 - - intel-level-zero-gpu=1.3.26516.29-682~22.04 +gpu_umd_rt_packages_u2204_20231219_server: + - intel-opencl-icd=23.22.26516.34-682~22.04 + - intel-level-zero-gpu=1.3.26516.34-682~22.04 - level-zero=1.11.0-649~22.04 - - intel-media-va-driver-non-free=23.2.4-678~22.04 
- - libmfx1=23.2.2-678~22.04 - - libmfxgen1=23.2.4-678~22.04 - - libvpl2=2023.3.0.0-678~22.04 - - libegl-mesa0=23.2.0.20230712.1-2073~22.04 - - libegl1-mesa=23.2.0.20230712.1-2073~22.04 - - libegl1-mesa-dev=23.2.0.20230712.1-2073~22.04 - - libgbm1=23.2.0.20230712.1-2073~22.04 - - libgl1-mesa-dev=23.2.0.20230712.1-2073~22.04 - - libgl1-mesa-dri=23.2.0.20230712.1-2073~22.04 - - libglapi-mesa=23.2.0.20230712.1-2073~22.04 - - libgles2-mesa-dev=23.2.0.20230712.1-2073~22.04 - - libglx-mesa0=23.2.0.20230712.1-2073~22.04 + - intel-media-va-driver-non-free=23.2.4-682~22.04 + - libmfx1=23.2.2-682~22.04 + - libmfxgen1=23.2.4-682~22.04 + - libvpl2=2023.3.0.0-682~22.04 + - libegl-mesa0=24.0.0.20231114.1-2088~22.04 + - libegl1-mesa=24.0.0.20231114.1-2088~22.04 + - libegl1-mesa-dev=24.0.0.20231114.1-2088~22.04 + - libgbm1=24.0.0.20231114.1-2088~22.04 + - libgl1-mesa-dev=24.0.0.20231114.1-2088~22.04 + - libgl1-mesa-dri=24.0.0.20231114.1-2088~22.04 + - libglapi-mesa=24.0.0.20231114.1-2088~22.04 + - libgles2-mesa-dev=24.0.0.20231114.1-2088~22.04 + - libglx-mesa0=24.0.0.20231114.1-2088~22.04 - libigdgmm12=22.3.7-678~22.04 - - libxatracker2=23.2.0.20230712.1-2073~22.04 - - mesa-va-drivers=23.2.0.20230712.1-2073~22.04 - - mesa-vdpau-drivers=23.2.0.20230712.1-2073~22.04 - - mesa-vulkan-drivers=23.2.0.20230712.1-2073~22.04 - - va-driver-all=2.19.0.2-64~u22.04 - -gpu_dev_packages_u2204_20230912_server: - - libigc1=1.0.14062.15-682~22.04 - - libigc-dev=1.0.14062.15-682~22.04 + - libxatracker2=24.0.0.20231114.1-2088~22.04 + - mesa-va-drivers=24.0.0.20231114.1-2088~22.04 + - mesa-vdpau-drivers=24.0.0.20231114.1-2088~22.04 + - mesa-vulkan-drivers=24.0.0.20231114.1-2088~22.04 + - va-driver-all=2.20.0.2-75~u22.04 + +gpu_dev_packages_u2204_20231219_server: + - libigc1=1.0.14062.19-682~22.04 + - libigc-dev=1.0.14062.19-682~22.04 - intel-igc-cm=1.0.202-682~22.04 - - libigdfcl1=1.0.14062.15-682~22.04 - - libigdfcl-dev=1.0.14062.15-682~22.04 - - libigfxcmrt7=23.2.4-678~22.04 - - libigfxcmrt-dev=23.2.4-678~22.04 + - libigdfcl1=1.0.14062.19-682~22.04 + - libigdfcl-dev=1.0.14062.19-682~22.04 + - libigfxcmrt7=23.2.4-682~22.04 + - libigfxcmrt-dev=23.2.4-682~22.04 - level-zero-dev=1.11.0-649~22.04 - - libvpl-dev=2023.3.0.0-678~22.04 + - libvpl-dev=2023.3.0.0-682~22.04 + +gpu_tool_packages_u2204_20231219_server: + - xpu-smi=1.2.22-34~22.04 + +gpu_kmd_packages_rhel92_20230929_server: + - intel-i915-dkms-1.23.6.37.230425.49-114 + - intel-fw-gpu-2023.39.2-255.el9 + +gpu_umd_rt_packages_rhel92_20230929_server: + - intel-media-23.2.4-i682.el9_2 + - intel-mediasdk-23.2.2-i682.el9_2 + - libmfxgen1-23.2.4-i682.el9_2 + - level-zero-1.11.0-682.el9_2 + - intel-level-zero-gpu-1.3.26516.32-682.el9_2 + - mesa-dri-drivers-23.2.0.20230712.1-2077.el9_2 + - libdrm-2.4.114-1.el9 + - mesa-libGL-23.2.0.20230712.1-2077.el9_2 + - intel-metrics-discovery-1.12.166-682.el9_2 + - intel-metrics-library-1.0.133-682.el9_2 + - intel-igc-core-1.0.14062.16-682.el9_2 + - libva-2.19.0.2-66.el9_2 + - intel-gmmlib-22.3.7-i682.el9_2 + - libmetee-3.1.5-65.el8_8 + - intel-gsc-0.8.9-65.el8_8 + - intel-igc-cm-1.0.202-682.el9_2 + - intel-ocloc-23.22.26516.32-682.el9_2 + - intel-opencl-23.22.26516.32-682.el9_2 + - libva-utils-2.19.0.2-1.0.61.el9_2 + - libvpl-tools-2023.3.0.0-i682.el9_2 + - libvpl2-2023.3.0.0-i682.el9_2 + - mesa-libEGL-23.2.0.20230712.1-2077.el9_2 + - mesa-libgbm-23.2.0.20230712.1-2077.el9_2 + - mesa-libxatracker-23.2.0.20230712.1-2077.el9_2 + - mesa-vdpau-drivers-23.2.0.20230712.1-2077.el9_2 + - mesa-vulkan-drivers-23.2.0.20230712.1-2077.el9_2 + 
+gpu_dev_packages_rhel92_20230929_server: + - intel-gsc-devel-0.8.9-65.el8_8 + - intel-igc-opencl-devel-1.0.14062.16-682.el9_2 + - level-zero-devel-1.11.0-682.el9_2 + - libmetee-devel-3.1.5-65.el8_8 + +gpu_tool_packages_rhel92_20230929_server: + - xpu-smi-1.2.16-27.el9_2 -gpu_tool_packages_u2204_20230912_server: - - xpu-smi=1.2.16-27~22.04 +# intel client gpu packages +gpu_kmd_packages_u2204_20231219_client: + - intel-i915-dkms=1.23.9.11.231003.15+i19-1 + - intel-fw-gpu=2023.39.2-255~22.04 + +gpu_umd_rt_packages_u2204_20231219_client: + - intel-opencl-icd=23.35.27191.42-775~22.04 + - intel-level-zero-gpu=1.3.27191.42-775~22.04 + - level-zero=1.14.0-744~22.04 + - intel-media-va-driver-non-free=23.4.0-775~22.04 + - libmfx1=23.2.2-775~22.04 + - libmfxgen1=23.4.0-775~22.04 + - libvpl2=2023.3.1.0-775~22.04 + - libegl-mesa0=24.0.0.20231114.1-2088~22.04 + - libegl1-mesa=24.0.0.20231114.1-2088~22.04 + - libegl1-mesa-dev=24.0.0.20231114.1-2088~22.04 + - libgbm1=24.0.0.20231114.1-2088~22.04 + - libgl1-mesa-dev=24.0.0.20231114.1-2088~22.04 + - libgl1-mesa-dri=24.0.0.20231114.1-2088~22.04 + - libglapi-mesa=24.0.0.20231114.1-2088~22.04 + - libgles2-mesa-dev=24.0.0.20231114.1-2088~22.04 + - libglx-mesa0=24.0.0.20231114.1-2088~22.04 + - libigdgmm12=22.3.12-742~22.04 + - libxatracker2=24.0.0.20231114.1-2088~22.04 + - mesa-va-drivers=24.0.0.20231114.1-2088~22.04 + - mesa-vdpau-drivers=24.0.0.20231114.1-2088~22.04 + - mesa-vulkan-drivers=24.0.0.20231114.1-2088~22.04 + - va-driver-all=2.20.0.2-75~u22.04 + +gpu_dev_packages_u2204_20231219_client: + - libigc1=1.0.15136.24-775~22.04 + - libigc-dev=1.0.15136.24-775~22.04 + - intel-igc-cm=1.0.206-775~22.04 + - libigdfcl1=1.0.15136.24-775~22.04 + - libigdfcl-dev=1.0.15136.24-775~22.04 + - libigfxcmrt7=23.4.0-775~22.04 + - libigfxcmrt-dev=23.4.0-775~22.04 + - level-zero-dev=1.14.0-744~22.04 + - libvpl-dev=2023.3.1.0-775~22.04 + +gpu_tool_packages_u2204_20231219_client: + - xpu-smi=1.2.22-34~22.04 -# intel client gpu packages gpu_kmd_packages_u2204_20230912_client: - intel-i915-dkms=1.23.7.17.230608.24+i37-1 - intel-fw-gpu=2023.35.5-247~22.04 @@ -103,55 +192,63 @@ gpu_dev_packages_u2204_20230912_client: gpu_tool_packages_u2204_20230912_client: - xpu-smi=1.2.16-27~22.04 - # MTL gpu related pcakges based on Beta BKC gpu_kmd_packages_u2204_mtl: - - linux-firmware=20220329.git681281e4-0ubuntu3.17-1ppa1~jammy3 + - linux-firmware gpu_umd_rt_packages_u2204_mtl: - - intel-media-va-driver-non-free=23.2.4-1ppa1~jammy7 - - libmfx1=23.2.2-1ppa1~jammy1 - - libmfx-gen1.2=23.2.4-1ppa1~jammy1 - - libvpl2=2023.3.0-1ppa1~jammy4 - - libegl-mesa0=23.1.5-1ppa1~jammy1 - - libegl1-mesa=23.0.2-1ppa1~jammy3 - - libegl1-mesa-dev=23.0.2-1ppa1~jammy3 - - libgbm1=23.1.5-1ppa1~jammy1 - - libgl1-mesa-dev=23.1.5-1ppa1~jammy1 - - libgl1-mesa-dri=23.1.5-1ppa1~jammy1 - - libglapi-mesa=23.1.5-1ppa1~jammy1 - - libgles2-mesa-dev=23.1.5-1ppa1~jammy1 - - libglx-mesa0=23.1.5-1ppa1~jammy1 - - libigdgmm12=22.3.7-1ppa1~jammy1 - - libxatracker2=23.1.5-1ppa1~jammy1 - - mesa-va-drivers=23.1.5-1ppa1~jammy1 - - mesa-vdpau-drivers=23.1.5-1ppa1~jammy1 - - mesa-vulkan-drivers=23.1.5-1ppa1~jammy1 - - va-driver-all=2.19.0-1ppa1~jammy1 - - libva2=2.19.0-1ppa1~jammy1 + - intel-media-va-driver-non-free + - libmfx1 + - libmfx-gen1.2 + - libvpl2 + - libegl-mesa0 + - libegl1-mesa + - libegl1-mesa-dev + - libgbm1 + - libgl1-mesa-dev + - libgl1-mesa-dri + - libglapi-mesa + - libgles2-mesa-dev + - libglx-mesa0 + - libigdgmm12 + - libxatracker2 + - mesa-va-drivers + - mesa-vdpau-drivers + - mesa-vulkan-drivers + - va-driver-all 
+ - libva2 + - libva-drm2 + - libva-glx2 + - libva-wayland2 + - libva-x11-2 gpu_dev_packages_u2204_mtl: - - libigfxcmrt7=23.2.4-1ppa1~jammy7 - - libigfxcmrt-dev=23.2.4-1ppa1~jammy7 - - libvpl-dev=2023.3.0-1ppa1~jammy4 - - libmfx-gen-dev=23.2.4-1ppa1~jammy1 - - libva-dev=2.19.0-1ppa1~jammy1 - - mesa-common-dev=23.1.5-1ppa1~jammy1 - - libxatracker-dev=23.1.5-1ppa1~jammy1 + - libigfxcmrt7 + - libigfxcmrt-dev + - libvpl-dev + - libmfx-gen-dev + - libva-dev + - mesa-common-dev + - libxatracker-dev gpu_system_packages_u2204_mtl: - - libwayland-egl1-mesa=23.0.2-1ppa1~jammy3 - - libweston-9-0=9.0.0-4ppa1~jammy2 - - libweston-9-dev=9.0.0-4ppa1~jammy2 - - weston=9.0.0-4ppa1~jammy2 + - libwayland-egl1-mesa + - libweston-10-0 + - libweston-10-dev + - weston - xserver-xorg-core -# - weston=10.0.0-1ppa1~jammy1 # intel gpu release independent test packages -gpu_test_packages: +gpu_ubuntu_test_packages: - hwinfo - vainfo - clinfo - mesa-utils - vulkan-tools - intel-gpu-tools + - onevpl-tools + +gpu_rhel_test_packages: + - hwinfo + - clinfo + - vulkan-tools diff --git a/collections/share/roles/install_gpu_driver/files/cek_detect_gpu_type.py b/collections/share/roles/install_gpu_driver/files/cek_detect_gpu_type.py index 6ca557f3..484ddac1 100644 --- a/collections/share/roles/install_gpu_driver/files/cek_detect_gpu_type.py +++ b/collections/share/roles/install_gpu_driver/files/cek_detect_gpu_type.py @@ -1,6 +1,7 @@ import os import sys +import re intel_dgpu_types = { "56c0" : "Flex", @@ -31,16 +32,15 @@ def detect_gpu_type(): - cmd = 'lspci | grep -i -E "Display|VGA" | grep Intel' + cmd = 'lspci -mmnn | grep -i -E "Display|VGA" | grep Intel' result = os.popen(cmd) info_list = result.read() lines = info_list.splitlines() line_count = len(lines) if line_count > 0 : line = lines[0] - idx1 = line.find("Device") + len("Device ") - idx2 = line.find(" ", idx1+1) - chip_id = line[idx1 : idx2].lower() + device = re.findall(r'\"(.*?)\"', line)[2] + chip_id = (device.split("["))[1].split("]")[0] if chip_id in intel_dgpu_types : gpu_type = intel_dgpu_types[chip_id] diff --git a/collections/share/roles/install_gpu_driver/files/cek_get_latest_gpu_pkgs.sh b/collections/share/roles/install_gpu_driver/files/cek_get_latest_gpu_pkgs.sh index caf4aa94..4f65a427 100755 --- a/collections/share/roles/install_gpu_driver/files/cek_get_latest_gpu_pkgs.sh +++ b/collections/share/roles/install_gpu_driver/files/cek_get_latest_gpu_pkgs.sh @@ -46,6 +46,8 @@ apt-cache madison libigdfcl-dev | head -n 1 apt-cache madison libigfxcmrt7 | head -n 1 apt-cache madison libigfxcmrt-dev | head -n 1 apt-cache madison level-zero-dev | head -n 1 +apt-cache madison libvpl-dev | head -n 1 + echo "gpu_tool_packages :" apt-cache madison xpu-smi | head -n 1 diff --git a/collections/share/roles/install_gpu_driver/tasks/debian.yml b/collections/share/roles/install_gpu_driver/tasks/debian.yml index 0d42f756..0bdb365c 100644 --- a/collections/share/roles/install_gpu_driver/tasks/debian.yml +++ b/collections/share/roles/install_gpu_driver/tasks/debian.yml @@ -85,10 +85,8 @@ - name: Install kernel dkms packages ansible.builtin.apt: - name: "{{ kernel_dkms_packages }}" + name: "{{ kernel_ubuntu_dkms_packages }}" -# AXG 20230714 out of tree driver can't pass compilation on kernel 6.2, -# use kernel in tree driver as workaround when kernel version > 6.2. 
- name: Install gpu kernel mode driver packages ansible.builtin.apt: name: "{{ gpu_kmd_packages }}" @@ -114,7 +112,7 @@ - name: Install gpu test packages ansible.builtin.apt: - name: "{{ gpu_test_packages }}" + name: "{{ gpu_ubuntu_test_packages }}" allow_downgrade: true - name: Reboot the system for these changes to take effect diff --git a/collections/share/roles/install_gpu_driver/tasks/debian_mtl.yml b/collections/share/roles/install_gpu_driver/tasks/debian_mtl.yml index 7bae3e13..89175fd9 100644 --- a/collections/share/roles/install_gpu_driver/tasks/debian_mtl.yml +++ b/collections/share/roles/install_gpu_driver/tasks/debian_mtl.yml @@ -10,13 +10,13 @@ - name: Add MTL deb repo ansible.builtin.apt_repository: filename: "intel-mtl" - repo: "deb https://download.01.org/intel-linux-overlay/ubuntu jammy main non-free multimedia" + repo: "deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/mtl.gpg] https://download.01.org/intel-linux-overlay/ubuntu jammy main non-free multimedia" state: present - name: Add MTL deb-src repo ansible.builtin.apt_repository: filename: "intel-mtl" - repo: "deb-src https://download.01.org/intel-linux-overlay/ubuntu jammy main non-free multimedia" + repo: "deb-src [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/mtl.gpg] https://download.01.org/intel-linux-overlay/ubuntu jammy main non-free multimedia" state: present update_cache: true @@ -26,7 +26,7 @@ mode: '0644' content: | Package: * - Pin: release o= intel-iot-linux-overlay + Pin: release o=intel-iot-linux-overlay Pin-Priority: 2000 - name: Force probe for MTL GPU and enable GUC @@ -36,8 +36,11 @@ register: boot_entry_cmdline - name: Update boot configure + # noqa: no-handler ansible.builtin.command: "update-grub" changed_when: false + when: + - boot_entry_cmdline.changed - name: Install gpu kmd and firmware packages ansible.builtin.apt: @@ -62,7 +65,7 @@ - name: Install gpu test packages ansible.builtin.apt: - name: "{{ gpu_test_packages }}" + name: "{{ gpu_ubuntu_test_packages }}" allow_downgrade: true - name: allow dmesg for normal account diff --git a/collections/share/roles/install_gpu_driver/tasks/main.yml b/collections/share/roles/install_gpu_driver/tasks/main.yml index 402c21f2..058cb97f 100644 --- a/collections/share/roles/install_gpu_driver/tasks/main.yml +++ b/collections/share/roles/install_gpu_driver/tasks/main.yml @@ -27,41 +27,51 @@ - name: Set repo and packages for server GPU installation on Ubuntu 22.04 set_fact: gpu_repo_spec: "{{ gpu_repo_spec_u2204_server }}" - gpu_kmd_packages: "{{ gpu_kmd_packages_u2204_20230912_server }}" - gpu_umd_rt_packages: "{{ gpu_umd_rt_packages_u2204_20230912_server }}" - gpu_dev_packages: "{{ gpu_dev_packages_u2204_20230912_server }}" - gpu_tool_packages: "{{ gpu_tool_packages_u2204_20230912_server }}" + gpu_kmd_packages: "{{ gpu_kmd_packages_u2204_20231219_server }}" + gpu_umd_rt_packages: "{{ gpu_umd_rt_packages_u2204_20231219_server }}" + gpu_dev_packages: "{{ gpu_dev_packages_u2204_20231219_server }}" + gpu_tool_packages: "{{ gpu_tool_packages_u2204_20231219_server }}" when: - - (ansible_distribution == "Ubuntu" and ansible_distribution_version == "22.04") + - (ansible_distribution == "Ubuntu" and ansible_distribution_version is version('22.04', '==')) - gpu_type == "Flex" - name: Set repo and packages for client GPU installation on Ubuntu 22.04 set_fact: gpu_repo_spec: "{{ gpu_repo_spec_u2204_client }}" gpu_kmd_packages: "{{ gpu_kmd_packages_u2204_20230912_client }}" - gpu_umd_rt_packages: "{{ gpu_umd_rt_packages_u2204_20230912_client }}" - gpu_dev_packages: "{{ 
gpu_dev_packages_u2204_20230912_client }}" + gpu_umd_rt_packages: "{{ gpu_umd_rt_packages_u2204_20231219_client }}" + gpu_dev_packages: "{{ gpu_dev_packages_u2204_20231219_client }}" + gpu_tool_packages: "{{ gpu_tool_packages_u2204_20231219_client }}" when: - - (ansible_distribution == "Ubuntu" and ansible_distribution_version == "22.04") + - (ansible_distribution == "Ubuntu" and ansible_distribution_version is version('22.04', '==')) - gpu_type == "Arc" or gpu_type == "iGPU" - name: Install GPU drivers on Ubuntu include_tasks: debian.yml when: - - (ansible_distribution == "Ubuntu" and ansible_distribution_version == "22.04") + - (ansible_distribution == "Ubuntu" and ansible_distribution_version is version('22.04', '==')) - gpu_type != "Unknown" - configured_arch not in ['ultra'] - name: Install MTL GPU drivers on Ubuntu include_tasks: debian_mtl.yml when: - - (ansible_distribution == "Ubuntu" and ansible_distribution_version == "22.04") + - (ansible_distribution == "Ubuntu" and ansible_distribution_version is version('22.04', '==')) - gpu_type != "Unknown" - configured_arch in ['ultra'] -- name: Install GPU drivers on RHEL 8.x +- name: Set repo and packages for server GPU installation on RHEL/Rocky 9.2 + set_fact: + gpu_kmd_packages: "{{ gpu_kmd_packages_rhel92_20230929_server }}" + gpu_umd_rt_packages: "{{ gpu_umd_rt_packages_rhel92_20230929_server }}" + gpu_dev_packages: "{{ gpu_dev_packages_rhel92_20230929_server }}" + gpu_tool_packages: "{{ gpu_tool_packages_rhel92_20230929_server }}" + when: + - (ansible_os_family == "RedHat" and ansible_distribution_version is version('9.2', '==')) + - gpu_type == "Flex" + +- name: Install GPU drivers on RHEL/Rocky 9.2 include_tasks: rhel.yml when: - - (ansible_os_family == "RedHat" and ansible_distribution_version >= "8.6") - - gpu_type != "Unknown" + - (ansible_os_family == "RedHat" and ansible_distribution_version is version('9.2', '==')) + - gpu_type == "Flex" diff --git a/collections/share/roles/install_gpu_driver/tasks/rhel.yml b/collections/share/roles/install_gpu_driver/tasks/rhel.yml index 20dbf79f..9ceb6ec1 100644 --- a/collections/share/roles/install_gpu_driver/tasks/rhel.yml +++ b/collections/share/roles/install_gpu_driver/tasks/rhel.yml @@ -1,3 +1,55 @@ --- -- name: GPU driver installation on RedHat - debug: msg="RedHat installation TBD" +- name: Download Intel Graphics repository + ansible.builtin.get_url: + url: "{{ gpu_repo_rhel92_url }}" + dest: "/etc/yum.repos.d/" + mode: '0644' + +- name: Set fact for kernel version + ansible.builtin.set_fact: + kernel_ver: "{{ ansible_kernel }}" + +- name: Install kernel headers in case they are missing + ansible.builtin.dnf: + name: kernel-headers-{{ kernel_ver }} + state: present + +- name: Install kernel dkms packages + ansible.builtin.dnf: + name: "{{ kernel_rhel_dkms_packages }}" + +- name: Install gpu kernel mode driver packages + ansible.builtin.dnf: + name: "{{ gpu_kmd_packages }}" + state: present + notify: + - reboot server + +- name: Install gpu user mode driver and runtime packages + ansible.builtin.dnf: + name: "{{ gpu_umd_rt_packages }}" + state: present + +- name: Install gpu dev packages + ansible.builtin.dnf: + name: "{{ gpu_dev_packages }}" + state: present + +- name: Install gpu tool packages + ansible.builtin.dnf: + name: "{{ gpu_tool_packages }}" + state: present + +- name: Install gpu test packages + ansible.builtin.dnf: + name: "{{ gpu_rhel_test_packages }}" + state: present + +- name: Add support for multi-gpu
system + ansible.builtin.lineinfile: + path: /etc/default/grub + line: GRUB_CMDLINE_LINUX="${GRUB_CMDLINE_LINUX} pci=realloc=off" + register: boot_entry_cmdline + notify: + - Update grub on RedHat systems + - reboot server diff --git a/docs/adq.md b/docs/adq.md index a34ecaab..c43f8ea0 100644 --- a/docs/adq.md +++ b/docs/adq.md @@ -124,9 +124,6 @@ spec: nodeSelector: kubernetes.io/hostname: "" # Update this line with the hostname of the controller tolerations: - - key: "node-role.kubernetes.io/master" - effect: "NoSchedule" - operator: "Exists" - key: "node-role.kubernetes.io/control-plane" effect: "NoSchedule" operator: "Exists" diff --git a/docs/calico_vpp.md b/docs/calico_vpp.md index 6593edfc..18168662 100644 --- a/docs/calico_vpp.md +++ b/docs/calico_vpp.md @@ -4,34 +4,35 @@ The Calico VPP dataplane is in beta and should not be used in production cluster Calico VPP dataplane has only been tested on **Ubuntu 22.04 LTS**. Due to the Kubespray does not support Calico VPP dataplane yet, so we choose to install a very basic setup without any actual mesh capable CNI in kubespray stage, then install calico and calico vpp dataplane with operator based installations in `roles/calico_vpp_install`. Besides due to the Calico VPP network changes, compatilibity with other feautes in the Reference Architecture can not be fully guaranteed. -This configuration guide assumes that the **"basic"** profile is being used as a starting point. It also assumes that the deployment is on single node currently (will support 1 controller and 1 worker later). +This configuration guide assumes that the **"basic"** profile is being used as a starting point. It can now be deployed across multiple nodes (here we take 1 controller + 2 workers on 2 hosts as an example). ## Prepare target servers Calico VPP requires a dedicated interface from an Intel® Ethernet 800 Series Network Adapter. Start by configuring the inteface on machine based on the following rules: * The interfaces must be from an Intel® Ethernet 800 Series Network Adapter -* The interfaces must have persistent IP addresses, e.g. 12.1.152.169/8 -* The servers must be reachable by the Ansible host from a separate interface and IP, e.g. 10.166.31.141/23 ``` +-------------------------+ | Master Node | | 10.166.31.141/23 | | | | | | +---------------------+ | |E810 | | |12.1.152.169/8 | | +---------------------+ | | +-------------------------+ +* The interfaces must have persistent IP addresses, e.g. 10.10.10.10, 10.10.10.11 +* The servers must be reachable by the Ansible host from a separate interface and IP, e.g. 192.168.100.100, 192.168.100.101 ``` ++-------------------------+ +-------------------------+ +| Master Node | | Worker Node | +| 192.168.100.100 | | 192.168.100.101 | +| | | | +| | | | +| +---------------------+ +---------------------+ | +| |ens108 | |ens108 | | +| |10.10.10.10 |----------|10.10.10.11 | | +| +---------------------+ +---------------------+ | +| | | | ++-------------------------+ +-------------------------+ ``` ## Ansible Configuration The `inventory.ini` file must be updated with the persistent IP addresses assigned to each server.
An example of this configuration can be seen below: ``` [all] - ansible_host=10.166.31.141 ip=12.1.152.169 ansible_user=USER ansible_password=XXXX + ansible_host=192.168.100.100 ip=10.10.10.10 ansible_user=USER ansible_password=XXXX + ansible_host=192.168.100.101 ip=10.10.10.11 ansible_user=USER ansible_password=XXXX localhost ansible_connection=local ansible_python_interpreter=/usr/bin/python3 ``` @@ -40,10 +41,16 @@ The following variables in `group_vars/all.yml` must be changed to support Calic kube_network_plugin: cni calico_network_backend: vxlan kube_network_plugin_multus: false -hugepages_enabled: true -number_of_hugepages_1G: 16 calico_vpp: enabled: true + +``` + +The following variables in `host_vars/.yml` must be changed to support Calico VPP: +``` +hugepages_enabled: true +number_of_hugepages_1G: 16 +install_dpdk: true ``` ## Post Deployment @@ -52,24 +59,33 @@ Once the deployment has completed, check the status of the Calico VPP deployment Check node status: ``` # kubectl get nodes -A -o wide -NAME STATUS ROLES AGE VERSION INTERNAL-IP EXTERNAL-IP OS-IMAGE KERNEL-VERSION CONTAINER-RUNTIME - Ready control-plane 12h v1.27.1 12.1.152.169 Ubuntu 22.04.2 LTS 5.15.0-72-generic docker://20.10.20 +NAME STATUS ROLES AGE VERSION INTERNAL-IP EXTERNAL-IP OS-IMAGE KERNEL-VERSION CONTAINER-RUNTIME + Ready control-plane 15h v1.28.3 10.10.10.10 Ubuntu 22.04.2 LTS 5.15.0-72-generic containerd://1.7.8 + Ready 15h v1.28.3 10.10.10.11 Ubuntu 22.04.2 LTS 5.15.0-72-generic containerd://1.7.8 + +# kubectl describe node | grep projectcalico +projectcalico.org/IPv4Address: 10.10.10.10 +projectcalico.org/IPv4VXLANTunnelAddr: 10.244.143.64 -# kubectl describe node | grep projectcalico -projectcalico.org/IPv4Address: 12.1.152.169/8 -projectcalico.org/IPv4VXLANTunnelAddr: 10.244.66.64 +# kubectl describe node | grep projectcalico +projectcalico.org/IPv4Address: 10.10.10.11 +projectcalico.org/IPv4VXLANTunnelAddr: 10.244.200.193 ``` Check pod status: ``` # kubectl get pods -n calico-vpp-dataplane -NAME READY STATUS RESTARTS AGE -calico-vpp-node-48pnm 2/2 Running 0 12h +NAME READY STATUS RESTARTS AGE +calico-vpp-node-9swvq 2/2 Running 1 (15h ago) 15h +calico-vpp-node-m42k9 2/2 Running 2 (15h ago) 15h ``` Check E810 NIC interfaces for VPP: ``` -# ethtool -i xxx +# ethtool -i ens108 driver: tun version: 1.6 +firmware-version: +expansion-rom-version: +bus-info: tap ... ``` Check the configured network subnet for containers: @@ -78,53 +94,57 @@ Check the configured network subnet for containers: NAME CIDR NAT IPIPMODE VXLANMODE DISABLED DISABLEBGPEXPORT SELECTOR default-ipv4-ippool 10.244.0.0/16 true Never CrossSubnet false false all() ``` -Check calico vpp status: +Check calico vpp status (taking the master node as an example): ``` -# calivppctl vppctl +# calivppctl vppctl _______ _ _ _____ ___ __/ __/ _ \ (_)__ | | / / _ \/ _ \ _/ _// // / / / _ \ | |/ / ___/ ___/ /_/ /____(_)_/\___/ |___/_/ /_/ -# show hardware-interfaces -Name Idx Link Hardware -TwentyFiveGigabitEthernet43/0/3 1 up TwentyFiveGigabitEthernet43/0/3 +vpp# show hardware-interfaces +Name Idx Link Hardware +HundredGigabitEthernet98/0/0 1 up HundredGigabitEthernet98/0/0 + Link speed: 100 Gbps ... - -# show int addr -TwentyFiveGigabitEthernet43/0/3 (up): - L3 12.1.152.169/8 +vpp# show int addr +HundredGigabitEthernet98/0/0 (up): + L3 10.10.10.10/8 ...
tap0 (up): - L3 12.1.152.169/32 ip4 table-id 1013904223 fib-idx 3 + L3 10.10.10.10/32 ip4 table-id 1013904223 fib-idx 3 ``` Create 2 test pods for simple check: ``` -# kubectl run test --image=busybox --command -- tail -f /dev/null +# kubectl run test --image=busybox --overrides='{"spec": { "nodeSelector": {"kubernetes.io/hostname": ""}}}' --command -- tail -f /dev/null pod/test created # kubectl exec test -- ip a show dev eth0 2: eth0: mtu 1450 qdisc mq qlen 500 link/[65534] - inet 10.244.66.100/32 scope global eth0 + inet 10.244.143.87/32 scope global eth0 + valid_lft forever preferred_lft forever + inet6 fe80::3e9d:58cb:257b:93ee/64 scope link flags 800 valid_lft forever preferred_lft forever -# kubectl run test1 --image=busybox --command -- tail -f /dev/null +# kubectl run test1 --image=busybox --overrides='{"spec": { "nodeSelector": {"kubernetes.io/hostname": ""}}}' --command -- tail -f /dev/null pod/test1 created # kubectl exec test1 -- ip a show dev eth0 2: eth0: mtu 1450 qdisc mq qlen 500 link/[65534] - inet 10.244.66.101/32 scope global eth0 + inet 10.244.200.215/32 scope global eth0 + valid_lft forever preferred_lft forever + inet6 fe80::d8cc:3cd5:45a4:4310/64 scope link flags 800 valid_lft forever preferred_lft forever -# kubectl exec test1 -- ping 10.244.66.100 -PING 10.244.66.100 (10.244.66.100): 56 data bytes -64 bytes from 10.244.66.100: seq=0 ttl=63 time=0.327 ms -64 bytes from 10.244.66.100: seq=1 ttl=63 time=0.823 ms -64 bytes from 10.244.66.100: seq=2 ttl=63 time=0.417 ms +# kubectl exec test1 -- ping 10.244.143.87 +PING 10.244.143.87 (10.244.143.87): 56 data bytes +64 bytes from 10.244.143.87: seq=0 ttl=62 time=0.665 ms +64 bytes from 10.244.143.87: seq=1 ttl=62 time=1.525 ms +64 bytes from 10.244.143.87: seq=2 ttl=62 time=0.741 ms # calicoctl get workloadEndpoint -o wide -NAME WORKLOAD NODE NETWORKS INTERFACE PROFILES NATS -xxx-k8s-test-eth0 test 10.244.66.100/32 cali1037a54e65e kns.default,ksa.default.default -xxx-k8s-test1-eth0 test1 10.244.66.101/32 cali99c376db89a kns.default,ksa.default.default +NAME WORKLOAD NODE NETWORKS INTERFACE PROFILES NATS +xxx-k8s-test-eth0 test 10.244.143.87/32 cali1037a54e65e kns.default,ksa.default.default +xxx-k8s-test1-eth0 test1 10.244.200.215/32 cali99c376db89a kns.default,ksa.default.default ``` diff --git a/docs/emr.md b/docs/emr.md index c2e2be1e..acd29aff 100644 --- a/docs/emr.md +++ b/docs/emr.md @@ -1,10 +1,10 @@ # EMR platform configuration guide -This guide introdues how to enable RA on the Intel EMR platforms. +This guide introduces how to enable RA on the Intel EMR platforms. ## BMRA configuration ### QAT Driver -Download the EMR QAT driver package and put it in the folder ``/tmp/emr_qat/`` folder on the ansible host machine. Then configure the QAT related operations in the files in the ``group_vars`` and ``host_vars`` referring to the security session in the below url +Download the EMR QAT driver package and put it in the folder ``/tmp/nda_qat/`` on the ansible host machine. Then configure the QAT related operations in the ``group_vars`` and ``host_vars`` files referring to the security section in the URL below ### DPDK driver diff --git a/docs/generate_profiles.md b/docs/generate_profiles.md index 2d84f18c..d4343533 100644 --- a/docs/generate_profiles.md +++ b/docs/generate_profiles.md @@ -6,8 +6,9 @@ 4. [Discover Supported Ethernet Network Adapters](#discover-supported-ethernet-network-adapters) 5. [Discover Supported Profiles](#discover-supported-profiles) 6.
[Discover Additional Configuration](#discover-additional-configuration) -7. [Example Commands](#example-commands) -8. [Playbooks Generation](#playbook-generation) +7. [Architecture and Ethernet Network Adapters type auto-detection](#architecture-and-ethernet-network-adapters-type-auto-detection) +8. [Example Commands](#example-commands) +9. [Playbooks Generation](#playbook-generation) --- @@ -65,9 +66,10 @@ These three directories represent available modes of the CEK project. At the moment, Container Experience Kits supports the following machine architectures: +* `gnr` - Granite Rapids - 'Next Generation Intel(R) Xeon(R) Scalable Processor' * `emr` - Emerald Rapids - '5th Generation Intel(R) Xeon(R) Scalable Processor' -* `spr` - Sapphire Rapids - '4th Generation Intel(R) Xeon(R) Scalable Processor' -* `icx` - IceLake (default) - '3rd Generation Intel(R) Xeon(R) Scalable Processor' +* `spr` - Sapphire Rapids (default) - '4th Generation Intel(R) Xeon(R) Scalable Processor' +* `icx` - IceLake - '3rd Generation Intel(R) Xeon(R) Scalable Processor' * `clx` - CascadeLake - '2nd Generation Intel(R) Xeon(R) Scalable Processor' * `skl` - SkyLake - '1st Generation Intel(R) Xeon(R) Scalable Processor' @@ -103,6 +105,19 @@ At the moment, Container Experience Kits supports the following optional configu * Configure mirrors for kubespray deployment - for detailed information, please read [mirrors guide](docs/mirrors.md) +## Architecture and Ethernet Network Adapters type auto-detection + +Container Experience Kits can automatically detect your target machines' architecture and Ethernet Network Adapter type. + +```bash +# for k8s mode +make auto-k8s-profile PROFILE=remote_fp HOSTS=10.10.10.11,10.10.10.12 USERNAME=root +# or for vm mode +make auto-vm-profile PROFILE=remote_fp HOSTS=10.10.10.11,10.10.10.12 USERNAME=root +``` + +> **_NOTE:_** Make sure that the SSH key is copied to all Kubernetes cluster nodes or VM hosts (the `ssh-copy-id @` command can be used for that). + ## Example Commands To generate files needed for deployment of `remote_fp` profile, for `Sapphire Rapids` machines, in `k8s` mode, with `cvl` Ethernet Network Adapter the following command must be executed: diff --git a/docs/gnr.md b/docs/gnr.md new file mode 100644 index 00000000..7f4be3d3 --- /dev/null +++ b/docs/gnr.md @@ -0,0 +1,147 @@ +# GNR platform configuration guide + +This guide introduces how to enable RA on the Intel GNR platforms. + +## BMRA configuration +### QAT Driver +Download the GNR QAT driver package and put it in the folder ``/tmp/nda_qat/`` on the ansible host machine. Then configure the QAT related operations in the ``group_vars`` and ``host_vars`` files referring to the security section in the URL below + + +### DPDK driver +On the GNR platform we will use the latest DPDK version. + +### TDX driver + +***Note: Only Ubuntu 22.04 is enabled for Intel TDX in RA*** + +Intel TDX (Trust Domain Extensions) can deploy hardware-isolated virtual machines called trust domains (TDs). Detailed info can be found [here](https://www.intel.com/content/www/us/en/developer/articles/technical/intel-trust-domain-extensions.html). + + +To install TDX, you should enable ``configure_tdx`` in the ``host_vars/.yml`` for BMRA. Then follow the [readme](https://github.com/intel-innersource/containers.orchestrators.kubernetes.container-experience-kits#readme) to deploy the RA cluster. + +After the installation, you need to configure the BIOS following the guide below to enable TDX, and install the msr tools via the command below.
+ +***Note: TDX BIOS options must only be enabled with the TDX kernel installed, otherwise it will cause a boot failure.*** + +``` +apt install msr-tools +``` +![Alt text](images/tdx-bios-configure.png). + +After the BIOS is correctly configured, you can run /opt/cek/tdx-tools/utils/check-tdx-host.sh on the host. You should get the following output: + +![Alt text](images/tdx-host-check.png) + +## TDVM configuration +### Host environment preparation + +To enable TDVM, the TDX kernel should be installed on the host first to make sure that TDX can be successfully configured. You can use the existing RA role ``bootstrap/install_tdx_drivers`` to install the kernel first, following the guide below. + +git clone https://github.com/intel-innersource/containers.orchestrators.kubernetes.container-experience-kits.git cek + +``` +cd +make vm-profile PROFILE=$profile ARCH=$arch + + + +ansible-playbook -i inventory.ini playbooks/intel/tdx.yml + + + +``` + + +After the installation is done, follow the guide in the ``TDX driver`` section above to configure the BIOS correctly. After it is correctly configured, you can observe the info below via dmesg: ``` +dmesg |grep -i tdx + + +[ 0.000000] Linux version 6.2.16-mvp30v3+7-generic(gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0, GNU ld (GNU Binutils for Ubuntu) 2.38) #mvp30v3+tdx SMP PREEMPT_DYNAMIC Wed Sep 6 11:29:15 CEST 2023 +[ 1.126416] tdx: BIOS enabled: private KeyID range [64, 128) +[ 14.295018] KVM-debug: PASS: single step TDX module emulated CPUID 0 +[ 14.295018] KVM-debug: PASS: single step TDX module emulated RDMSR 0x1a0 +[ 38.271537] tdx: SEAMCALL failed: leaf 254, error 0xc000050500000000. +[ 38.271929] tdx: TDDEBUGCONFIG isn't supported. +[ 38.296005] tdx: TDX module: atributes 0x0, vendor_id 0x8086, major_version 1, minor_version 5, build_date 20230420, build_num 507 +[ 38.296014] tdx: TDX module: features0: fbf +[ 39.228648] tdx: 262659 pages allocated for PAMT. +[ 39.228656] tdx: TDX module initialized. +[ 39.228667] kvm_intel: tdx: max servtds supported per user TD is 1 +[ 39.228676] kvm_intel: tdx: live migration supported +[ 39.228677] kvm_intel: TDX is supported. +``` + +### VMRA deployment with TDVM + +Follow the [readme](https://github.com/intel-innersource/containers.orchestrators.kubernetes.container-experience-kits#readme) to prepare the installation environment. Then run the command: +``` +make vm-profile PROFILE=on_prem ARCH=gnr + +``` + +Then change ``configure_tdx`` to ``true`` in the ``host_vars/.yml`` and ``host_vars/.yml``, then run the command below to deploy the VMRA cluster. +For TDX 1.5/1.0, PCI passthrough into VMs is not allowed due to security considerations, so DO NOT enable PCI passthrough in the host_vars/.yml or dataplane_interfaces/qat_devices in host_vars/.yml. + +Below are configuration snippets for the VM host and VM configuration for TDVM. + +Snippet for PCI passthrough configuration in ``host_vars/.yml``: +``` +vms: + (...)
+# pci: + - "18:02.2" # 18:xx.x are example VFs for networking + - "18:02.3" + - "18:02.4" + - "18:02.5" + - "3d:01.1" # 3x:xx.x are example VFs for QAT + - "3f:01.1" +``` + +Snippet for dataplane_interfaces and qat_devices in ``host_vars/.yml``: + +``` +# dataplane interface configuration list +dataplane_interfaces: [] +#dataplane_interfaces: +# - bus_info: "06:00.0" # PCI bus info +# pf_driver: iavf # Driver inside VM +# sriov_numvfs: 0 +# default_vf_driver: "igb_uio" +# - bus_info: "07:00.0" +# pf_driver: iavf +# sriov_numvfs: 0 +# default_vf_driver: "iavf" +# - bus_info: "08:00.0" +# pf_driver: iavf +# sriov_numvfs: 0 +# default_vf_driver: "iavf" +# - bus_info: "09:00.0" +# pf_driver: iavf +# sriov_numvfs: 0 +# default_vf_driver: "igb_uio" + +(...) + +# QAT interface configuration list +qat_devices: [] +#qat_devices: +# - qat_id: "0000:0a:00.0" +# qat_sriov_numvfs: 0 # Has to be set to 0 here to not create any VFs inside VM. + +# - qat_id: "0000:0b:00.0" +# qat_sriov_numvfs: 0 # Has to be set to 0 here to not create any VFs inside VM. +``` + +``` +ansible-playbook -i inventory.ini playbooks/vm.yml +``` + + + +## Generic VMRA configuration +No special configuration for GNR, please refer to the [VMRA guide](https://networkbuilders.intel.com/solutionslibrary/network-and-edge-virtual-machine-reference-system-architecture-user-guide) for deployment. + +## Cloud RA configuration +Not supported yet, to be done. diff --git a/docs/ipu_setup.md b/docs/ipu_setup.md index 1032d501..1df07ae4 100644 --- a/docs/ipu_setup.md +++ b/docs/ipu_setup.md @@ -91,7 +91,7 @@ To prepare needed deployment environment please follow listed steps from [README step 1. (basic) step 2. section b) step 3. -step 4. (tested with defaults icx/cvl) +step 4. (tested with icx/cvl) step 7. (k8s) step 8. proxy and mirror settings if needed diff --git a/docs/mirrors.md b/docs/mirrors.md index d821bfad..5defd00a 100644 --- a/docs/mirrors.md +++ b/docs/mirrors.md @@ -8,7 +8,7 @@ To generate mirror links in group vars for any profile, set MIRRORS=true for mak Example: ```bash - make k8s-profile PROFILE=full_nfv ARCH=icx NIC=cvl MIRRORS=true + make k8s-profile PROFILE=full_nfv ARCH=spr NIC=cvl MIRRORS=true ``` ## Set correct values for mirror links and file URLs diff --git a/docs/monitoring.md b/docs/monitoring.md index 3f77a0b4..75689313 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -10,17 +10,15 @@ Telemetry stack in RA project is consist of following components: ## How to work with monitoring stack: ## Telemetry Various dashboards are available for monitoring telemetry in the NEP stack with the possibility of creating new dashboards or editing existing ones. -Dashboards can be viewed by accessing Grafana. This can be done in several ways: +Dashboards can be viewed by accessing Grafana. This can be done with the following steps: -1. By directly accessing Grafana through an open port: +1. Create port forwarding from the Grafana pod on the Controller node: - https://:30000 -2. By creating an SSH tunnel with port forwarding (sometimes direct port access can be blocked by network management): + kubectl port-forward -n monitoring service/grafana 30000:grafana-https - ssh -L 30000:localhost:30000 @ - And in the browser open address: +2. In the browser, open the address: - https://localhost:30000 + https://:30000 In the basic settings, the user "admin" and the password "admin" are set for grafana. We strongly recommend changing your password to a secure option.
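A note on the port-forward step above: `kubectl port-forward` binds to 127.0.0.1 on the machine where it runs, so when it is started on the controller node the address above is only reachable from a remote browser if the forward listens on all interfaces. A minimal sketch, assuming the `monitoring` namespace and `grafana` service shown above:

    kubectl port-forward --address 0.0.0.0 -n monitoring service/grafana 30000:grafana-https

Alternatively, an SSH tunnel to the controller node still works if binding all addresses is undesirable.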
@@ -30,16 +28,14 @@ Most cluster parameters can be monitored through Grafana. Grafana provides seve Logs can be accessed in several ways. 1. By using Kibana. -Same as Grafana you can access Kibana by two different ways: - - By directly accessing Kibana through an open port: +Same as with Grafana, you can access Kibana with the following steps: + - Create port forwarding from the Kibana pod on the Controller node: - https://:30001 - - By creating an SSH tunnel with port forwarding: + kubectl port-forward -n monitoring service/kibana 30001:kibana-http - ssh -L 30001:localhost:30001 @ And in the browser open address: - https://localhost:30001 + https://:30001 - Username and login can be optain by these commands: Username: @@ -57,4 +53,4 @@ Same as Grafana you can access Kibana by two different ways: ## Kubernetes dashboard To enable deployment of the Kubernetes dashboard, it is necessary to check whether the variable "kube_dashboard" is in the "on" state. -For accessing kubernetes-dashboard follow [kubespray documentation](https://github.com/kubernetes-sigs/kubespray/blob/master/docs/getting-started.md#accessing-kubernetes-dashboard). \ No newline at end of file +For accessing kubernetes-dashboard follow [kubespray documentation](https://github.com/kubernetes-sigs/kubespray/blob/master/docs/getting-started.md#accessing-kubernetes-dashboard). diff --git a/docs/power_manager.md b/docs/power_manager.md index 4cb29bf2..8c050f87 100644 --- a/docs/power_manager.md +++ b/docs/power_manager.md @@ -2,11 +2,15 @@ 1. [Introduction](#introduction) 2. [Check the existence of sample power pods on the cluster](#check-the-existence-of-sample-power-pods-on-the-cluster) -3. [Check the frequencies which will be set by `balance-performance` Power Profile](#check-the-frequencies-which-will-be-set-by-balance-performance-power-profile) +3. [Obtain frequencies applied by each Power Profile](#obtain-frequencies-applied-by-each-power-profile) 4. [Obtain cores on which `balance-performance` Power Profile is applied](#obtain-cores-on-which-balance-performance-power-profile-is-applied) 5. [Check the frequencies on cores](#check-the-frequencies-on-cores) 6. [The Shared Profile](#the-shared-profile) -7. [Known limitations](#known-limitations) +7. [C-States](#c-states) +8. [Uncore Frequency](#uncore-frequency) +9. [Time of Day](#time-of-day) +10. [Scaling Governors](#scaling-governors) +11. [Known limitations](#known-limitations) --- @@ -15,7 +19,7 @@ Intel Power Manager is available for icx, spr, and clx architectures (you can find more about supported architectures in `generate_profiles` docs), and can be enabled in group vars. After a successful deployment, the user can utilize special resources to manipulate cores' frequencies. -Sample pods can be deployed by setting `deploy_example_pods: true` in group vars. +Sample pods can be deployed by setting `deploy_example_pods: true` in group vars and can be defined for each Power Node in specific host vars.
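As a minimal sketch of that wiring — the `deploy_example_pods` flag comes from the text above, while the per-node list and its key name are hypothetical illustrations, not taken from the profiles:

```yaml
# group_vars/all.yml -- flag named in the text above
deploy_example_pods: true

# host_vars/node1.yml -- hypothetical per-node Power Profile selection
power_profiles:          # assumed key name, for illustration only
  - performance
  - balance-performance
  - balance-power
```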
The results of Power Manager work can be obtained in the following way: @@ -23,76 +27,58 @@ ```bash kubectl get pods -n intel-power -NAME READY STATUS RESTARTS AGE -balance-performance-power-pod 1/1 Running 0 21m -balance-power-power-pod 1/1 Running 0 21m -controller-manager-f584c9458-682p5 1/1 Running 0 16h -performance-power-pod 1/1 Running 0 21m -power-node-agent-8cxmp 2/2 Running 0 16h +NAME READY STATUS RESTARTS AGE +balance-performance-power-pod-node1 1/1 Running 0 122m +balance-power-power-pod-node1 1/1 Running 0 122m +controller-manager-765ccfd89b-m42q4 1/1 Running 0 123m +performance-power-pod-node1 1/1 Running 0 122m +power-node-agent-qhz4g 1/1 Running 0 123m ``` > NOTE: output may be different depending on the number of nodes and requested Power Profiles. -Three pods, one for each profile, were deployed. Let's stick to `balance-performance-power-pod`. +Three pods, one for each profile, were deployed. Let's stick to `balance-performance-power-pod-node1`. -## Check the frequencies which will be set by `balance-performance` Power Profile +## Obtain frequencies applied by each Power Profile ```bash -kubectl get PowerProfiles -n intel-power balance-performance-node1 -o yaml -apiVersion: power.intel.com/v1alpha1 -kind: PowerProfile -metadata: - creationTimestamp: "2022-01-25T17:07:08Z" - generation: 1 - name: balance-performance-node1 - namespace: intel-power - resourceVersion: "17538" - uid: 05599219-d042-4b9c-9bbf-42ef67effd24 -spec: - epp: balance_performance - max: 2700 - min: 2500 - name: balance-performance-node1 +kubectl get PowerNodes -A -o yaml ``` -> NOTE: The max/min frequencies may differ on your machine. - -In `spec` the values max and min represent new frequencies that will be set to specific cores. - ## Obtain cores on which `balance-performance` Power Profile is applied ```bash -kubectl get PowerWorkloads -n intel-power balance-performance-node1-workload -o yaml -apiVersion: power.intel.com/v1alpha1 +kubectl get PowerWorkloads -n intel-power balance-performance-node1 -o yaml +apiVersion: power.intel.com/v1 kind: PowerWorkload metadata: - creationTimestamp: "2022-01-26T10:12:00Z" - generation: 1 - name: balance-performance-node1-workload + creationTimestamp: "2023-12-12T10:56:12Z" + generation: 2 + name: balance-performance-node1 namespace: intel-power - resourceVersion: "246287" - uid: f8720a7e-f7b2-4f31-bf4f-2a38ad8a7c07 + resourceVersion: "8030" + uid: 6740161f-45e2-461d-ad41-063f6336b367 spec: - name: balance-performance-node1-workload - nodeInfo: + name: balance-performance-node1 + powerProfile: balance-performance + workloadNodes: containers: - exclusiveCpus: - 2 - - 66 - id: 495d5547a5211774e605c4a2ebe4b9fbcf44fbd056cc08e0847b68143627700a + - 74 + id: containerd://31d40cf10a653f073bffb4fec6456e79be60fac4d838407272188e53e1d66fb8 name: balance-performance-container - pod: balance-performance-power-pod - powerProfile: balance-performance-node1 + pod: balance-performance-power-pod-node1 + powerProfile: balance-performance cpuIds: - 2 - - 66 + - 74 name: node1 - powerProfile: balance-performance-node1 ``` > > NOTE: The cores may differ on your machine.
-`balance-performance` Power Profile is applied to core numbers 2 and 66 +`balance-performance` Power Profile is applied to core numbers 2 and 74 You can also check all assigned cores in your Power Nodes with the following command: @@ -104,13 +90,13 @@ kubectl get PowerNodes -A -o yaml ```bash cat /sys/devices/system/cpu/cpu2/cpufreq/scaling_max_freq -2700000 +2825000 cat /sys/devices/system/cpu/cpu2/cpufreq/scaling_min_freq -2500000 -cat /sys/devices/system/cpu/cpu66/cpufreq/scaling_max_freq -2700000 -cat /sys/devices/system/cpu/cpu66/cpufreq/scaling_min_freq -2500000 +2625000 +cat /sys/devices/system/cpu/cpu74/cpufreq/scaling_max_freq +2825000 +cat /sys/devices/system/cpu/cpu74/cpufreq/scaling_min_freq +2625000 ``` In comparison, the core that was not obtained by Power Workload has the following values: @@ -135,6 +121,22 @@ The Shared Profile has either a cluster-wide or single node impact. The Shared W The resources for Shared Profile are not visible in allocatable kubelet resources as cores will be scaled as soon as Shared Workload is deployed. +## C-States + +C-States can be set in host_vars for each node by setting cstates.enabled to true. Users can choose to change C-States for the Shared Pool, a specific Power Profile, or even a specific core. + +## Uncore Frequency + +Uncore frequency can be configured at a system-wide, per-package, or per-die level, again in host_vars for each node by setting uncore_frequency.enabled to true. + +## Time of Day + +Time of Day can be configured in host_vars by setting time_of_day.enabled to true. Currently, there is a known limitation that only one Time of Day schedule can exist in the cluster, so please note that only the first schedule in the cluster will be applied. + +## Scaling Governors + +Scaling governors first need a scaling driver configured in host_vars for each node. To choose a specific scaling governor, users can either set a global scaling governor in group_vars or a local scaling governor in host_vars. A combined host_vars sketch covering these options is shown below. + ## Known limitations 1. The Performance Power Profile @@ -157,6 +159,4 @@ More than one Shared Power Profile cannot be used on the same node. For example, Shared Profile will grab all cores that are not marked as exclusive - please consider not deploying shared profile if special pods will need access to cores scaled via performance, balance-performance, or balance-power profiles. -Due to strong dependency on AppQoS the list for exclusive CPUs must not be empty even if there are no exclusive CPUs in the kubelet config at the moment. Please put the last core from the machine to the list of exclusive CPUs in host vars in that case. - Shared Workload **may not** obtain all available cores, but will grab ones from the default pool if other profiles released them.
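The combined host_vars sketch referenced above — only the `cstates.enabled`, `uncore_frequency.enabled`, and `time_of_day.enabled` switches come from the sections above; the surrounding shape is a hypothetical illustration:

```yaml
# host_vars/node1.yml -- hedged sketch; only the .enabled switches are documented above
cstates:
  enabled: true
uncore_frequency:
  enabled: true
time_of_day:
  enabled: true
```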
diff --git a/docs/profile_overview/BMRA.pdf b/docs/profile_overview/BMRA.pdf index 40b4f8fd..40a8c4f7 100644 Binary files a/docs/profile_overview/BMRA.pdf and b/docs/profile_overview/BMRA.pdf differ diff --git a/docs/profile_overview/CloudRA.pdf b/docs/profile_overview/CloudRA.pdf index 4532be85..4c90272e 100644 Binary files a/docs/profile_overview/CloudRA.pdf and b/docs/profile_overview/CloudRA.pdf differ diff --git a/docs/profile_overview/VMRA.pdf b/docs/profile_overview/VMRA.pdf index b6d67d36..36789ee0 100644 Binary files a/docs/profile_overview/VMRA.pdf and b/docs/profile_overview/VMRA.pdf differ diff --git a/docs/redeploy_cleanup.md b/docs/redeploy_cleanup.md new file mode 100644 index 00000000..4558c1ec --- /dev/null +++ b/docs/redeploy_cleanup.md @@ -0,0 +1,25 @@ +# Cleanup mechanism +1. [Introduction](#introduction) +2. [Run cleanup playbook](#run-cleanup-playbook) +3. [Run cleanup playbook for specific tag](#run-cleanup-playbook-for-specific-tag---in-future) + +## Introduction +With the cleanup mechanism, the user can use the redeploy_cleanup playbook to clean up a deployed k8s cluster and prepare for redeployment. Users should be able to use tags to clean up specific features. + +The cleanup mechanism should clean an existing deployment, but there is no guarantee that the system OS will return to the same state as before the deployment. + +Deploying a different profile after cleanup is not supported. + +Re-running the same profile without cleanup is supported, but only without significant changes to the configuration in host/group vars. + +Cleanup of specific features is currently not supported. + +## Run cleanup playbook +```bash + ansible-playbook -i inventory.ini playbooks/redeploy_cleanup.yml +``` + +## Run cleanup playbook for specific tag - in future +```bash + ansible-playbook -i inventory.ini playbooks/redeploy_cleanup.yml --tags "your_tag" +``` diff --git a/docs/sriov.md b/docs/sriov.md index a7e15ebb..8e6421d9 100644 --- a/docs/sriov.md +++ b/docs/sriov.md @@ -42,6 +42,13 @@ dataplane_interfaces: sriov_vfs: [] ``` +`dataplane_interfaces` can also be configured automatically. All compatible NICs will be discovered and configured. The default VF driver is `iavf`, which can be changed by modifying `dataplane_interface_default_vf_driver`. The number of VFs will be set to the maximum available on your NIC. + +``` +dataplane_interface_default_vf_driver: "iavf" +dataplane_interfaces: [] +``` + Next option defines whether the SRIOV CNI plugin will be installed on the target worker node. Setting it to `true` will cause the Ansible scripts to build and install SRIOV CNI plugin in the `/opt/cni/bin` directory on the target server. ``` sriov_cni_enabled: true` diff --git a/docs/storage.md b/docs/storage.md index 374235fb..88130978 100644 --- a/docs/storage.md +++ b/docs/storage.md @@ -41,6 +41,19 @@ rook_ceph: storage_class: "rook-cephfs" # Storage class name storage_type: "cephfs" # Storage type for rook-ceph, supported values[cephfs, block, object] ``` +## Remote Storage +For remote storage, the container runtime only supports containerd and CRI-O. Below are the steps to configure the remote storage functionality: +``` +1. configure the storage nodes in group_vars/all.yml + storage_nodes: [] # if not set, all kube nodes will be used as storage nodes. + # storage_nodes: + # - node0 + # - node1 +2. configure the disk in the host_vars/.yml + Described in the disk configuration section. +``` +Then you can deploy the cluster. After the cluster is successfully deployed, you can use the remote storage function in the cluster.
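A minimal group_vars sketch of step 1 above, with placeholder node names:

```yaml
# group_vars/all.yml -- if storage_nodes stays [], every kube node becomes a storage node
storage_nodes:
  - node0
  - node1
```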
+ ## Disk Configuration Same as described in local volume static provisioner. diff --git a/docs/vm_cluster_expansion_guide.md b/docs/vm_cluster_expansion_guide.md index 34f18517..43f842be 100644 --- a/docs/vm_cluster_expansion_guide.md +++ b/docs/vm_cluster_expansion_guide.md @@ -40,6 +40,15 @@ vms: - "18:02.7" - "b1:01.3" - "b3:01.3" +# Or in case with auto-configuration of SRIOV NIC and SRIOV QAT + - type: "work" + name: "vm-work-4" + cpu_total: 16 + memory: 20480 + vxlan: 128 + pci: [] + nic_devices_count: 4 + qat_devices_count: 2 ``` New host_vars file needs to be created for added vm-work node. In our case host_vars/vm-work-4.yml. @@ -76,6 +85,15 @@ vms: - "18:02.7" - "b1:01.3" - "b3:01.3" +# Or in case with auto-configuration of SRIOV NIC and SRIOV QAT + - type: "work" + name: "vm-work-5" + cpu_total: 16 + memory: 20480 + vxlan: 128 + pci: [] + nic_devices_count: 4 + qat_devices_count: 2 ``` diff --git a/docs/vm_config_guide.md b/docs/vm_config_guide.md index c041cc99..b43057e6 100644 --- a/docs/vm_config_guide.md +++ b/docs/vm_config_guide.md @@ -50,6 +50,16 @@ qat_devices: qat_sriov_numvfs: 10 ``` +**SRIOV QAT** and **SRIOV NIC** can also be configured automatically. To do so, leave `dataplane_interfaces` and `qat_devices` empty; these options will be set automatically during the playbook run. Both will be set up with the maximum number of VFs available. +``` +dataplane_interface_default_vf_driver: "iavf" +dataplane_interfaces: [] + +qat_devices: [] +``` + +**_NOTE:_** It's not necessary to configure both SRIOV QAT and SRIOV NIC automatically; one can be configured manually while the other is auto-configured. + Next section provides VM related configuration options. The first option defines VM image distribution of cloud image, which will be used inside VMs. Currently supported distributions are: "ubuntu" and "rocky". Default is "ubuntu" @@ -123,17 +133,38 @@ To enable VM cluster name feature uncomment vm_cluster_name parameter in host_va Next section provides definition of VMs, which will be created during deployment process, and which will be used as control and worker nodes there. vms option defines the list of VMs. Each VM is defined by following parameters: -`type` defines type of VM and following types are supported: "ctrl" and "work" +`type` defines the type of VM; the following types are supported: "ctrl", "work" and "vm" `name` defines hostname for the VM, which is assigned to VM. That name have to be used for corresponding host_vars file. e.g.: host_vars/vm-work-1.yml `cpu_total` defines total number of CPUs assigned to VM. If value `0` is added here then all available CPUs from one NUMA node are assigned to this VM. If value `0` is added together with optional parameter `alloc_all: true` then all available CPUs from VM host are assigned to this VM. `memory` defines amount of memory assigned to VM in MB `vxlan` defines vxlan id of the vxlan network, where VM will be connected to. It has to be the one, which was added to dhcp parameter above. -`pci` defines list of PCI devices assigned to VM. It contains PCI ids for SRIOV NIC VFs and SRIOV QAT VFs which are assigned to VM. The list can be empty as well. PCI section is relevant only for VM type `work`. In example configuration bellow we've assigned 4 NIC VFs and 2 QAT VFs. +`pci` defines the list of PCI devices assigned to the VM. It contains PCI ids of the SRIOV NIC VFs and SRIOV QAT VFs assigned to the VM. The list can be empty as well. The PCI section is relevant only for VM types `work` and `vm`.
In the example configuration below we've assigned 4 NIC VFs and 2 QAT VFs. -To be able to configure PCI ids for VFs we need to know their "naming convention". We need to connect to VM host and check PCI ids for VFs there. +If **SRIOV NIC** or **SRIOV QAT** is configured automatically on the VM host as described above, the PCI device list for VMs will be set automatically, so the PCI parameter can be left empty (`pci: []`). The desired number of NIC or QAT SRIOV VFs for worker node VMs can be specified as follows: +``` +vms: + - type: "work" + name: "vm-work-1" + cpu_total: 16 + memory: 61440 + vxlan: 120 + pci: [] + nic_devices_count: 8 + qat_devices_count: 4 + - type: "work" + name: "vm-work-2" + cpu_total: 16 + memory: 61440 + vxlan: 120 + pci: [] + nic_devices_count: 8 + qat_devices_count: 4 +``` + +Otherwise, if SRIOV NIC or SRIOV QAT were defined manually, we need to know the VFs' "naming convention" to be able to configure their PCI ids. We need to connect to the VM host and check the PCI ids for the VFs there. **For SRIOV NIC VFs:** @@ -363,6 +394,28 @@ vms: - "3f:02.3" ``` +Example configuration if SRIOV QAT is configured automatically while SRIOV NIC is configured manually: + +``` +vms: + - type: "ctrl" + name: "vm-ctrl-1" + cpu_total: 8 + memory: 20480 + vxlan: 120 + - type: "work" + name: "vm-work-1" + cpu_total: 16 + memory: 61440 + vxlan: 120 + pci: + - "18:02.2" + - "18:02.3" + - "18:02.4" + - "18:02.5" + qat_devices_count: 16 +``` + Example expert mode configuration contains 2 VMs, 1 control and 1 work node. ``` @@ -403,6 +456,8 @@ There's also a set of configuration options that are applied in per-node manner The first set of variables configure assigned SRIOV NIC VFs and SRIOV QAT VFs inside VM. It requires setting `iommu_enabled` as `false`. +On worker nodes, **SRIOV NIC** and **SRIOV QAT** can also be configured automatically, regardless of whether they were configured automatically or manually on the VM host. To do so, leave `dataplane_interfaces` or `qat_devices` empty; devices will be discovered at runtime. + **For SRIOV NIC** it requires passing names of interfaces together with additional NIC parameters. In below example `dataplane_interfaces` configuration contains 4 interfaces, where the first one starting with bus_info "06:00.0". The number in PCI id is sequentially increasing. `sriov_numvfs` must be "0" here. We can't create new VFs out of provided VF. `pf_driver` and `default_vf_driver` are not use at the moment. All interfaces are assigned to kernel mode iavf driver inside VM. The number of interfaces defined here in `dataplane_interfaces` have to be the same as number of NIC VFs assigned to this VM ! @@ -414,6 +469,14 @@ In our example configuration we've assigned 2 QAT VFs, so we have 2 devices defi This setting will add `vfio-pci.disable_denylist=1` kernel flags for Ubuntu/RHEL/Rocky specific versions, and as a result will reboot the target vm-work VM during deployment.
+``` +dataplane_interfaces: [] + +qat_devices: [] +``` + +or + ``` dataplane_interfaces: - bus_info: "06:00.0" diff --git a/generate/playbook_templates/infra_playbook.j2 b/generate/playbook_templates/infra_playbook.j2 index 904451e5..9e578d81 100644 --- a/generate/playbook_templates/infra_playbook.j2 +++ b/generate/playbook_templates/infra_playbook.j2 @@ -1,5 +1,17 @@ --- # apply common cluster node configuration +- hosts: k8s_cluster + roles: + - role: cluster_defaults + tags: always + when: "'bm_host' in group_names" + - role: bootstrap/configure_proxy + tags: + - proxy + - intel-platform-sgx-setup + - intel-platform-qat-setup + when: "'bm_host' in group_names" + - hosts: k8s_cluster,vms,vm_host handlers: - name: reboot server @@ -7,6 +19,15 @@ when: - inventory_hostname != "localhost" pre_tasks: + - name: Tag target systems with RA version used + become: true + ansible.builtin.copy: + dest: "/etc/ra_deployment_info" + mode: '0644' + content: | + ra_version_commit: {{'{{' }} ra_git_commit {{ '}}' }} + ra_version_dirty: {{ '{{' }} ra_git_is_dirty {{ '}}' }} + when: ra_is_git | default(false) - name: End play for VM host and BM host meta: end_host when: @@ -16,6 +37,7 @@ roles: - role: cluster_defaults tags: always + - role: bootstrap/update_machine_id - role: bootstrap/configure_proxy tags: - proxy @@ -25,7 +47,7 @@ - role: bootstrap/run_dhclient_systemd_service_on_boot tags: run-dhclient-service-on-boot when: - - ansible_os_family != "RedHat" or ansible_distribution_version <= "9.0" + - ansible_os_family != "RedHat" or ansible_distribution_version is version('9.0', '<=') - enable_dhclient_systemd_service | default(false) - role: bootstrap/update_grub tags: @@ -68,6 +90,12 @@ # apply worker node kernel configuration - hosts: kube_node,vms,vm_host handlers: + - name: Update grub on RedHat systems + ansible.builtin.command: "grub2-mkconfig -o /boot/grub2/grub.cfg" + when: ansible_os_family == "RedHat" + - name: Update grub on Ubuntu systems + ansible.builtin.command: "update-grub" + when: ansible_distribution == "Ubuntu" - name: reboot server reboot: { reboot_timeout: 1200 } when: @@ -125,13 +153,6 @@ - local_volume_provisioner_enabled | default(false) | bool or minio_enabled | default(false) | bool or rook_ceph.enabled | default(false) | bool - - role: bootstrap/auto_detect_qat_devices - tags: - - auto-detect-qat-device - - intel-platform-qat-setup - when: - - configure_qat | default(false) | bool - - qat_devices | default([]) | length == 0 - role: bootstrap/set_sriov_kernel_flags tags: - setup-sriov @@ -148,26 +169,29 @@ - iommu_enabled | default(true) | bool - ((configure_dsa_devices is defined and configure_dsa_devices) or (configure_dlb_devices is defined and configure_dlb_devices)) and - ((ansible_distribution == "Ubuntu" and ansible_distribution_version == '20.04' and update_kernel) or - (ansible_distribution == "Ubuntu" and ansible_distribution_version >= '21.04') or - (ansible_os_family == "RedHat" and ansible_distribution_version >= '8.6')) + ((ansible_distribution == "Ubuntu" and ansible_distribution_version is version('20.04', '==') and update_kernel) or + (ansible_distribution == "Ubuntu" and ansible_distribution_version is version('21.04', '>=')) or + (ansible_os_family == "RedHat" and ansible_distribution_version is version('8.6', '>='))) - role: bootstrap/set_rdt_kernel_flags when: telegraf_enabled | default(false) | bool - role: bootstrap/set_intel_flexran_kernel_flags when: intel_flexran_enabled | default(false) | bool -{% if playbook_name in ['full_nfv', 'remote_fp', 
'on_prem', 'on_prem_vss', 'build_your_own'] %} + - role: bootstrap/set_calico_vpp_interface_name + tags: calico-vpp + when: calico_vpp.enabled | default(false) | bool +{% if playbook_name in ['full_nfv', 'remote_fp', 'on_prem', 'on_prem_vss', 'build_your_own', 'base_video_analytics'] %} - role: bootstrap/configure_sst tags: sst when: - - (ansible_distribution == "Ubuntu" and ansible_distribution_version >= "20.04") or - (ansible_os_family == "RedHat" and ansible_distribution_version >= "8.3") + - (ansible_distribution == "Ubuntu" and ansible_distribution_version is version('20.04', '>=')) or + (ansible_os_family == "RedHat" and ansible_distribution_version is version('8.3', '>=')) - sst_bf_configuration_enabled | default(false) | bool or sst_cp_configuration_enabled | default(false) | bool or sst_tf_configuration_enabled | default(false) | bool or sst_pp_configuration_enabled | default(false) | bool - not vm_enabled or on_vms | default(false) | bool {% endif %} -{% if playbook_name in ['full_nfv', 'on_prem', 'on_prem_vss', 'on_prem_aibox', 'regional_dc', 'build_your_own'] %} +{% if playbook_name in ['full_nfv', 'on_prem', 'on_prem_vss', 'on_prem_aibox', 'regional_dc', 'build_your_own', 'base_video_analytics'] %} - role: bootstrap/set_pcie_kernel_flags when: - configure_fpga | default(false) | bool @@ -211,7 +235,7 @@ environment: "{{ '{{' }} proxy_env | d({}) {{ '}}' }}" any_errors_fatal: true -{% if playbook_name in ['full_nfv', 'access', 'on_prem', 'on_prem_vss', 'remote_fp', 'build_your_own'] %} +{% if playbook_name in ['full_nfv', 'access', 'on_prem', 'on_prem_vss', 'remote_fp', 'build_your_own', 'base_video_analytics'] %} # install worker node qat software - hosts: kube_node,vms,vm_host handlers: @@ -240,9 +264,9 @@ tags: dlb-dp when: - configure_dlb_devices is defined and configure_dlb_devices - - (ansible_distribution == "Ubuntu" and ansible_distribution_version == '20.04' and update_kernel) or - (ansible_distribution == "Ubuntu" and ansible_distribution_version >= '21.04') or - (ansible_os_family == "RedHat" and ansible_distribution_version >= '8.6') + - (ansible_distribution == "Ubuntu" and ansible_distribution_version is version('20.04', '==') and update_kernel) or + (ansible_distribution == "Ubuntu" and ansible_distribution_version is version('21.04', '>=')) or + (ansible_os_family == "RedHat" and ansible_distribution_version is version('8.6', '>=')) - role: bootstrap/configure_dsa tags: dsa-dp when: configure_dsa_devices | default(false) @@ -281,6 +305,7 @@ tags: - determine-dataplane-interfaces - update-nic-firmware + - intel-ethernet-operator when: - dataplane_interfaces | default([]) | length > 0 - role: bootstrap/update_nic_drivers @@ -317,6 +342,7 @@ - determine-dataplane-interfaces - setup-sriov-nic - sriov-network-operator + - intel-ethernet-operator when: - dataplane_interfaces | default([]) | length > 0 - role: install_dpdk @@ -324,9 +350,9 @@ - dpdk - intel-platform-qat-setup when: - - ovs_dpdk_enabled | default(false) | bool or install_dpdk | default(true) | bool + - install_dpdk | default(true) | bool {% endif %} -{% if playbook_name in ['full_nfv', 'remote_fp', 'build_your_own'] %} +{% if playbook_name in ['full_nfv', 'remote_fp', 'build_your_own', 'base_video_analytics'] %} - role: install_ddp_pkgs when: - install_ddp_packages | default(true) | bool @@ -338,13 +364,9 @@ - install_dpdk | default(true) | bool - iommu_enabled | default(true) | bool - not on_vms | default(false) | bool - - (kubernetes | default(true) | bool - and not container_runtime_only_deployment 
| default(false) | bool - and not sriov_network_operator_enabled | default(false) | bool - or (not kubernetes | default(true) | bool - and container_runtime_only_deployment | default(false) | bool)) + - not sriov_network_operator_enabled | default(false) | bool {% endif %} -{% if playbook_name in ['full_nfv', 'access', 'on_prem', 'on_prem_vss', 'remote_fp', 'build_your_own'] %} +{% if playbook_name in ['full_nfv', 'access', 'on_prem', 'on_prem_vss', 'remote_fp', 'build_your_own', 'base_video_analytics'] %} - role: bootstrap/install_qatlibs tags: qatlibs when: @@ -367,7 +389,7 @@ when: - openssl_install | default(false) | bool {% endif %} -{% if playbook_name in ['full_nfv', 'access', 'on_prem', 'on_prem_vss', 'remote_fp', 'regional_dc', 'build_your_own'] %} +{% if playbook_name in ['full_nfv', 'access', 'on_prem', 'on_prem_vss', 'remote_fp', 'regional_dc', 'build_your_own', 'base_video_analytics'] %} - role: bootstrap/configure_sgx tags: - sgx diff --git a/generate/playbook_templates/intel_playbook.j2 b/generate/playbook_templates/intel_playbook.j2 index 544897e6..8c1d406b 100644 --- a/generate/playbook_templates/intel_playbook.j2 +++ b/generate/playbook_templates/intel_playbook.j2 @@ -12,6 +12,11 @@ tags: remove-kubespray-host-dns-settings when: - remove_kubespray_host_dns_settings | default(false) | bool + - role: ingress_nginx_install + tags: ingress-nginx + when: + - ingress_enabled | default(false) + - inventory_hostname == groups['kube_control_plane'][0] # install sigstore policy controller ahead of others to allow namespace signing enforcement - role: sigstore_policy_controller tags: sigstore @@ -34,14 +39,20 @@ - nfd - intel-platform-qat-setup - intel-platform-sgx-setup + - gpu-dp + - dlb-dp + - dsa-dp when: nfd_enabled | default(false) | bool -{% if playbook_name in ['full_nfv', 'remote_fp', 'build_your_own'] %} +{% if playbook_name in ['full_nfv', 'remote_fp', 'build_your_own', 'base_video_analytics'] %} - role: intel_cpu_controlplane tags: cpu-ctlplane when: intel_cpu_controlplane.enabled | default(false) | bool {% endif %} - role: operator_framework - tags: operator-framework + tags: + - operator-framework + - intel-ethernet-operator + - intel-sriov-fec-operator when: - intel_ethernet_operator_enabled | default(false) | bool or intel_sriov_fec_operator_enabled | default(false) | bool and not (intel_flexran_enabled | default(false) | bool and intel_flexran_type == "pod") @@ -65,12 +76,15 @@ - sriov_network_operator_enabled | default(false) | bool - not sriov_net_dp_enabled | default(false) | bool - not sriov_cni_enabled | default(false) | bool -{% if playbook_name in ['access', 'full_nfv', 'on_prem', 'on_prem_vss', 'regional_dc', 'remote_fp', 'build_your_own'] %} +{% if playbook_name in ['access', 'full_nfv', 'on_prem', 'on_prem_vss', 'regional_dc', 'remote_fp', 'build_your_own', 'base_video_analytics'] %} - role: intel_dp_operator tags: - dp-operator - intel-platform-qat-setup - intel-platform-sgx-setup + - gpu-dp + - dlb-dp + - dsa-dp when: sgx_dp_enabled | default(false) or gpu_dp_enabled | default(false) or qat_dp_enabled | default(false) or @@ -83,26 +97,26 @@ when: - sgx_dp_enabled | default(false) {% endif %} -{% if playbook_name in ['full_nfv', 'on_prem', 'on_prem_vss', 'regional_dc', 'build_your_own'] %} +{% if playbook_name in ['full_nfv', 'on_prem', 'on_prem_vss', 'regional_dc', 'build_your_own', 'base_video_analytics'] %} - role: gpu_dp_install tags: gpu-dp when: gpu_dp_enabled | default(false) | bool {% endif %} -{% if playbook_name in ['access', 'full_nfv', 
'on_prem', 'on_prem_vss', 'remote_fp', 'build_your_own'] %} +{% if playbook_name in ['access', 'full_nfv', 'on_prem', 'on_prem_vss', 'remote_fp', 'build_your_own', 'base_video_analytics'] %} - role: qat_dp_install tags: - qat-dp - intel-platform-qat-setup when: qat_dp_enabled | default(false) | bool {% endif %} -{% if playbook_name in ['full_nfv', 'on_prem', 'on_prem_vss', 'remote_fp', 'regional_dc', 'build_your_own'] %} +{% if playbook_name in ['full_nfv', 'on_prem', 'on_prem_vss', 'remote_fp', 'regional_dc', 'build_your_own', 'base_video_analytics'] %} - role: dlb_dp_install tags: dlb-dp when: - dlb_dp_enabled is defined and dlb_dp_enabled | default(false) | bool - - (ansible_distribution == "Ubuntu" and ansible_distribution_version == '20.04' and update_kernel) or - (ansible_distribution == "Ubuntu" and ansible_distribution_version >= '21.04') or - (ansible_os_family == "RedHat" and ansible_distribution_version >= '9.0') + - (ansible_distribution == "Ubuntu" and ansible_distribution_version is version('20.04', '==') and update_kernel) or + (ansible_distribution == "Ubuntu" and ansible_distribution_version is version('21.04', '>=')) or + (ansible_os_family == "RedHat" and ansible_distribution_version is version('9.0', '>=')) - role: dsa_dp_install tags: dsa-dp when: dsa_dp_enabled is defined and dsa_dp_enabled | default(false) | bool @@ -115,8 +129,8 @@ kmra.pccs.enabled | default(false) | bool or kmra.apphsm.enabled | default(false) | bool or kmra.ctk_loadkey_demo.enabled | default(false) | bool - - (ansible_distribution == "Ubuntu" and ansible_distribution_version >= '20.04') - or (ansible_os_family == "RedHat" and ansible_distribution_version >= '8.3') + - (ansible_distribution == "Ubuntu" and ansible_distribution_version is version('20.04', '>=')) + or (ansible_os_family == "RedHat" and ansible_distribution_version is version('8.3', '>=')) - role: tcs_install tags: tcs when: @@ -126,19 +140,19 @@ when: - tac.enabled | default(false) | bool {% endif %} -{% if playbook_name in ['access', 'full_nfv', 'on_prem', 'on_prem_vss', 'remote_fp', 'regional_dc', 'build_your_own'] %} +{% if playbook_name in ['access', 'full_nfv', 'on_prem', 'on_prem_vss', 'remote_fp', 'regional_dc', 'build_your_own', 'base_video_analytics'] %} - role: kubernetes_power_manager tags: power-manager when: kubernetes_power_manager is defined and kubernetes_power_manager.enabled | default(false) | bool {% endif %} -{% if playbook_name in ['access', 'full_nfv', 'on_prem', 'on_prem_vss', 'remote_fp', 'build_your_own'] %} +{% if playbook_name in ['access', 'full_nfv', 'on_prem', 'on_prem_vss', 'remote_fp', 'build_your_own', 'base_video_analytics'] %} - role: openssl_engine_install tags: - openssl-engine - intel-platform-qat-setup when: openssl_engine_enabled | default(false) | bool {% endif %} -{% if playbook_name in ['full_nfv', 'on_prem', 'on_prem_vss', 'regional_dc', 'remote_fp', 'build_your_own'] %} +{% if playbook_name in ['full_nfv', 'on_prem', 'on_prem_vss', 'regional_dc', 'remote_fp', 'build_your_own', 'base_video_analytics'] %} - role: platform_aware_scheduling_install tags: platform-aware-scheduling when: tas_enabled | default(true) | bool or gas_enabled | default(true) | bool @@ -146,16 +160,14 @@ - role: prometheus_install tags: prometheus when: - - prometheus_enabled | default(false) | bool + - prometheus_stack_enabled | default(false) | bool - role: collectd_install tags: monitoring when: - collectd_enabled | default(false) | bool - - not (telegraf_enabled | default(false) | bool) - role: telegraf_install 
when: - telegraf_enabled | default(false) | bool - - not (collectd_enabled | default(false) | bool) tags: monitoring {% if playbook_name in ['full_nfv', 'on_prem', 'on_prem_vss', 'regional_dc', 'build_your_own'] %} - role: minio_install @@ -169,12 +181,6 @@ when: - rook_ceph.enabled | default(false) | bool {% endif %} -{% if playbook_name in ['on_prem', 'on_prem_vss', 'build_your_own'] %} - - role: intel_media_analytics - tags: intel-media-analytics - when: - - intel_media_analytics_enabled | default(false) | bool -{% endif %} {% if playbook_name in ['full_nfv', 'on_prem', 'regional_dc', 'build_your_own'] %} - role: ffmpeg_install tags: intel-ffmpeg @@ -200,13 +206,6 @@ tags: intel_csl_excat when: - intel_csl_excat_enabled | default(false) | bool -{% endif %} -{% if playbook_name in ['on_prem_aibox'] %} - - role: intel_base_container - tags: - - base_container - when: - - intel_base_container_enabled | default(false) | bool {% endif %} environment: - "{{ '{{' }} proxy_env | d({}) {{ '}}' }}" @@ -229,10 +228,22 @@ when: bond_cni_enabled | default(true) | bool tags: bond-cni {% endif %} -{% if playbook_name in ['full_nfv', 'remote_fp', 'build_your_own'] %} +{% if playbook_name in ['full_nfv', 'remote_fp', 'build_your_own', 'base_video_analytics'] %} - role: userspace_cni_install tags: userspace-cni - when: userspace_cni_enabled | default(true) | bool + when: userspace_cni_enabled | default(false) | bool +{% endif %} +{% if playbook_name in ['on_prem_aibox', 'on_prem_vss', 'on_prem'] %} + - role: intel_base_container + tags: + - base_container + when: + - intel_base_container_enabled | default(false) | bool + - role: intel_inband_manageability + tags: + - intel_inband_manageability + when: + - intel_inband_manageability_enabled | default(false) | bool {% endif %} environment: "{{ '{{' }} proxy_env | d({}) {{ '}}' }}" any_errors_fatal: true @@ -246,14 +257,18 @@ tags: net-attach-defs when: - kubernetes | default(false) | bool - - role: elasticsearch_install - tags: elasticsearch + - role: kubevirt_install + tags: kubevirt + when: + - kubevirt_enabled | default(false) + - role: eck_install + tags: eck when: - - elasticsearch_enabled | default(false) | bool + - eck_enabled | default(false) | bool - role: jaeger_install tags: jaeger when: - - jaeger_operator | default(false) | bool + - jaeger_enabled | default(false) | bool - role: opentelemetry_install when: - opentelemetry_enabled | default(false) | bool @@ -301,6 +316,12 @@ - intel_sriov_fec_operator_enabled | default(false) | bool - not (intel_flexran_enabled | default(false) | bool and intel_flexran_type == "pod") {% endif %} +{% if playbook_name in ['on_prem', 'on_prem_vss', 'build_your_own'] %} + - role: intel_media_analytics + tags: intel-media-analytics + when: + - intel_media_analytics_enabled | default(false) | bool +{% endif %} {% if playbook_name in ['access', 'full_nfv', 'on_prem', 'on_prem_vss', 'regional_dc', 'remote_fp', 'build_your_own'] %} - role: istio_service_mesh tags: istio-service-mesh diff --git a/generate/playbook_templates/main_playbook.j2 b/generate/playbook_templates/main_playbook.j2 index 24146ae8..3a4ec5fa 100644 --- a/generate/playbook_templates/main_playbook.j2 +++ b/generate/playbook_templates/main_playbook.j2 @@ -1,4 +1,6 @@ --- +- name: SRIOV NIC and SRIOV QAT auto-detection + import_playbook: autodetect.yml - name: preflight checks import_playbook: preflight.yml when: preflight_enabled | default(true) | bool diff --git a/generate/profiles_templates/cloud/profiles.yml 
b/generate/profiles_templates/cloud/profiles.yml index 1048f156..af4d0f28 100644 --- a/generate/profiles_templates/cloud/profiles.yml +++ b/generate/profiles_templates/cloud/profiles.yml @@ -46,6 +46,7 @@ # frequency_scaling # cstate # uncore_frequency +# time_of_day # - telemetry: # prometheus # collectd @@ -81,6 +82,9 @@ # - tadk # - cadvisor # - imtl +# - container_runtime_default - is in ['containerd', 'crio', 'docker'] +# - infra_power_manager +# - ingress_nginx --- access: name: access @@ -127,6 +131,7 @@ access: frequency_scaling: off cstate: off uncore_frequency: off + time_of_day: off telemetry: prometheus: on collectd: off @@ -163,6 +168,9 @@ access: adq_dp: off cadvisor: on imtl: off + container_runtime_default: containerd + infra_power_manager: optional + ingress_nginx: off basic: name: basic @@ -182,6 +190,7 @@ basic: frequency_scaling: optional cstate: optional uncore_frequency: off + time_of_day: off telemetry: prometheus: on collectd: off @@ -206,6 +215,9 @@ basic: ai: optional cadvisor: on imtl: off + container_runtime_default: containerd + infra_power_manager: optional + ingress_nginx: off full_nfv: name: full_nfv @@ -253,6 +265,7 @@ full_nfv: frequency_scaling: optional cstate: optional uncore_frequency: off + time_of_day: optional telemetry: prometheus: on collectd: off @@ -290,6 +303,9 @@ full_nfv: adq_dp: off cadvisor: on imtl: off + container_runtime_default: containerd + infra_power_manager: optional + ingress_nginx: off on_prem: name: on_prem @@ -330,6 +346,7 @@ on_prem: frequency_scaling: off cstate: optional uncore_frequency: off + time_of_day: optional telemetry: prometheus: on collectd: off @@ -362,6 +379,9 @@ on_prem: ai: optional cadvisor: on imtl: off + container_runtime_default: containerd + infra_power_manager: optional + ingress_nginx: off regional_dc: name: regional_dc @@ -396,6 +416,7 @@ regional_dc: frequency_scaling: optional cstate: optional uncore_frequency: off + time_of_day: optional telemetry: prometheus: on collectd: off @@ -428,6 +449,9 @@ regional_dc: ai: optional cadvisor: on imtl: off + container_runtime_default: containerd + infra_power_manager: optional + ingress_nginx: off remote_fp: name: remote_fp @@ -471,6 +495,7 @@ remote_fp: frequency_scaling: off cstate: optional uncore_frequency: off + time_of_day: optional telemetry: prometheus: on collectd: off @@ -504,6 +529,9 @@ remote_fp: ai: optional cadvisor: optional imtl: off + container_runtime_default: containerd + infra_power_manager: optional + ingress_nginx: off build_your_own: name: build_your_own @@ -549,6 +577,7 @@ build_your_own: frequency_scaling: off cstate: optional uncore_frequency: off + time_of_day: optional telemetry: prometheus: optional collectd: off @@ -586,3 +615,6 @@ build_your_own: adq_dp: off cadvisor: optional imtl: off + container_runtime_default: containerd + infra_power_manager: optional + ingress_nginx: off diff --git a/generate/profiles_templates/common/group_vars.j2 b/generate/profiles_templates/common/group_vars.j2 index 1f6138b5..00844728 100644 --- a/generate/profiles_templates/common/group_vars.j2 +++ b/generate/profiles_templates/common/group_vars.j2 @@ -76,6 +76,7 @@ container_runtime_only_deployment: false # Tip: you can set this per host using host_vars update_all_packages: false + {% if arch in ['ultra'] %} update_kernel: true nda_kernel_path: "/tmp/linux-kernel-overlay" @@ -128,10 +129,10 @@ kubernetes: true # Kubernetes provisioner, Support: rke2(work with os ubuntu22.04 and containerd as container_runtime only), kubespray(default option) 
kube_provisioner: kubespray -kube_version: v1.27.1 # test placeholder: n version -#kube_version: v1.26.4 # test placeholder: n-1 version -#kube_version: v1.25.9 # test placeholder: n-2 version -rke2_version: v1.26.9+rke2r1 # test placeholder: n version +kube_version: v1.28.3 # test placeholder: n version +#kube_version: v1.27.7 # test placeholder: n-1 version +#kube_version: v1.26.10 # test placeholder: n-2 version +rke2_version: v1.28.3+rke2r1 # test placeholder: n version {% if kube_dashboard in ['on', 'optional'] %} # Kubernetes Dashboard @@ -154,15 +155,19 @@ audit_policy_custom_rules: "" # Kubernetes container runtime: docker, containerd, crio # When "crio" is set, please enable "crio_registries" section -{% if cloud_mode == 'on' %} -container_runtime: containerd -{% else %} -container_runtime: docker -{% endif %} +container_runtime: {{ container_runtime_default }} {% if rancher_manager in ['on', 'optional'] %} # Rancher Manager(supported on rke2 currently) rancher_manager_enabled: {% if rancher_manager == 'on' %}true{% else %}false{% endif %} + +{% endif %} + +{% if kubevirt in ['on', 'optional'] %} +# Kubevirt - virtual machine management add-on for Kubernetes +# More info here: https://github.com/kubevirt/kubevirt +kubevirt_enabled: {% if kubevirt == 'on' %}true{% else %}false{% endif %} + {% endif %} ######################## @@ -176,7 +181,7 @@ kube_proxy_metrics_bind_address: 127.0.0.1 kube_proxy_nodeport_addresses_cidr: 127.0.0.0/8 kube_pods_subnet: 10.244.0.0/16 -{% if name in ['regional_dc', 'full_nfv', 'access', 'build_your_own'] %} +{% if name in ['regional_dc', 'full_nfv', 'access', 'build_your_own', 'base_video_analytics'] %} {% set mask = 18 %} {% elif name == 'remote_fp' %} {% set mask = 19 %} @@ -211,18 +216,31 @@ kube_network_plugin_multus: {% if multus == 'on' and calico_vpp != 'on' %}true{% # Set on true if you want to enable the eBPF dataplane support calico_bpf_enabled: false +{% if ingress_nginx in ["on", "optional"] %} +# Kubernetes Ingress Controller to support Ingress resources +# Ingress can be accessed through the nodeport +ingress_enabled: {% if ingress_nginx == 'on' %}true{% else %}false{% endif +%} +# Uncomment if different nodeports need to be set +#ingress_nodeport_http: 30123 +#ingress_nodeport_https: 30124 + +{% endif %} {% if sriov_network_dp in ["on", "optional"] or network_userspace in ["on", "optional"] %} +{% if network_userspace in ['on', 'optional'] %} +# Userspace CNI +userspace_cni_enabled: {% if network_userspace == 'on' %}true{% else %}false{% endif %} + + +{% endif %} # Create reference net-attach-def objects example_net_attach_defs: # Values below should match host_vars CNI configuration {% if sriov_network_dp in ["on", "optional"] %} - sriov_net_dp: {% if sriov_network_dp == "on" %}true{% else %}false{% endif %} - + sriov_net_dp: {% if sriov_network_dp == "on" %}true{% else %}false{% endif +%} {% endif %} {% if network_userspace in ["on", "optional"] %} - userspace_ovs_dpdk: {% if network_userspace == "on" %}true{% else %}false{% endif %} - - userspace_vpp: false + userspace_ovs_dpdk: {% if ovs_dpdk == "on" %}true{% else %}false{% endif +%} + userspace_vpp: {% if vpp == "on" %}true{% else %}false{% endif +%} {% endif %} {% endif %} @@ -317,7 +335,7 @@ rook_ceph: enabled: {% if rook_ceph == 'on' %}true{% else %}false{% endif %} storage_class: "rook-cephfs" # Storage class name - storage_type: "cephfs" # Storage type for rook-ceph, supported values[cephfs, block, object].
+ storage_type: "cephfs" # Storage type for rook-ceph, supported values[cephfs, nfs, block, object]. log_level: "DEBUG" # The logging level for the operator: ["ERROR", "WARNING", "INFO", "DEBUG"] allow_loop_devices: true # Allow using loop devices for osds in test clusters enable_nfs: true # Enable the CSI NFS drivers @@ -350,7 +368,6 @@ minio_deploy_test_mode: true # When true, use a file as # When false, use an actual NVME or SSD device when creating storage minio_build_image_locally: true # Build custom MinIO image locally minio_awsclient_pods_enabled: true # Run AWS client pods for MinIO Tenant service -minio_ingress_enabled: false # Enable MinIO tenant ingress {% endif %} #################### @@ -365,6 +382,12 @@ sriov_net_dp_namespace: kube-system # Whether to build and store image locally or use one from public external registry sriov_net_dp_build_image_locally: false # SR-IOV network device plugin configuration. +{% if nic == 'cvl' %} +nic_supported_pf_dev_ids: ["1592", "1593", "159b"] +{% else %} +nic_supported_pf_dev_ids: ["158a", "158b", "1572", "0d58", "1583"] +{% endif %} +nic_supported_vf_dev_ids: ["154c", "10ed", "1889"] # For more information on supported configuration refer to: https://github.com/intel/sriov-network-device-plugin#configurations {% if intel_flexran == 'on' %} # sriovdp_config_data for Intel FlexRAN is defined in the helm_values for the sriov_dp_install role @@ -412,23 +435,23 @@ sriovdp_config_data: | {% endif %} {% endif %} -{% if sgx_dp in ['on', 'optional'] and arch in ['icx', 'spr', 'emr'] or +{% if sgx_dp in ['on', 'optional'] and arch in ['icx', 'spr', 'emr', 'gnr'] or gpu_dp in ['on', 'optional'] or qat_dp in ['on', 'optional'] or - dsa_dp in ['on', 'optional'] and arch in ['spr', 'emr'] or - dlb_dp in ['on', 'optional'] and arch in ['spr', 'emr'] %} + dsa_dp in ['on', 'optional'] and arch in ['spr', 'emr', 'gnr'] or + dlb_dp in ['on', 'optional'] and arch in ['spr', 'emr', 'gnr'] %} # Intel Device Plugin Operator intel_dp_namespace: kube-system # Namespace will be applied for SGX DP, GPU DP and QAT DP {% endif %} -{% if dlb_dp in ['on', 'optional'] and arch in ['spr', 'emr'] %} +{% if dlb_dp in ['on', 'optional'] and arch in ['spr', 'emr', 'gnr'] %} # Intel Dynamic Load Balancing Device Plugin (Intel DLB DP) for Kubernetes dlb_dp_enabled: {% if dlb_dp == 'on' %}true{% else %}false{% endif %} # If true set configure_dlb_devices to true in host vars dlb_dp_build_image_locally: false dlb_dp_verbosity: 4 {% endif %} -{% if dsa_dp in ['on', 'optional'] and arch in ['spr', 'emr'] %} +{% if dsa_dp in ['on', 'optional'] and arch in ['spr', 'emr', 'gnr'] %} # Intel Data Streaming Accelerator Device Plugin (Intel DSA DP) for Kubernetes dsa_dp_enabled: {% if dsa_dp == 'on' %}true{% else %}false{% endif %} # If true set configure_dsa_devices to true in host vars dsa_dp_build_image_locally: false @@ -460,6 +483,7 @@ qat_supported_pf_dev_ids: - "18a0" - "4940" - "4942" + - "4944" qat_supported_vf_dev_ids: - "443" @@ -470,6 +494,7 @@ qat_supported_vf_dev_ids: - "18a1" - "4941" - "4943" + - "4945" {% endif %} {% if gpu_dp in ['on', 'optional'] %} @@ -482,13 +507,13 @@ gpu_dp_build_image_locally: false # Configuration-options # To fully discover the below settings usage, please refer to: https://github.com/intel/intel-device-plugins-for-kubernetes/tree/v0.24.0/cmd/gpu_plugin gpu_dp_shared_devices: 10 # Number of containers (min. 
1) that can share the same GPU device -gpu_dp_monitor_resources: false # Enable monitoring all GPU resources on the node -gpu_dp_fractional_manager: false # Enable handling of fractional resources for multi-GPU nodes +gpu_dp_monitor_resources: {% if telemetry.intel_xpumanager == 'on' %}true{% else %}false{% endif %} # Enable monitoring all GPU resources on the node +gpu_dp_fractional_manager: {% if gas == 'on' %}true{% else %}false{% endif %} # Enable handling of fractional resources for multi-GPU nodes gpu_dp_prefered_allocation: 'none' # Available policies are: ['balanced', 'packed', 'none'] {% else %} gpu_dp_enabled: false {% endif %} -{% if sgx_dp in ['on', 'optional'] and arch in ['icx', 'spr', 'emr'] %} +{% if sgx_dp in ['on', 'optional'] and arch in ['icx', 'spr', 'emr', 'gnr'] %} # Intel SGX Device Plugin for Kubernetes sgx_dp_enabled: {% if sgx_dp == 'on' %}true{% else %}false{% endif %} @@ -533,6 +558,9 @@ sriov_network_operator_namespace: "sriov-network-operator" # Intel Ethernet Operator for Intel E810 Series network interface cards intel_ethernet_operator_enabled: {% if intel_ethernet_operator.enabled == 'on' and nic == 'cvl' %}true{% else %}false{% endif %} +# Set to true if the Operator should be built from source; needed for flow_configuration +intel_ethernet_operator_local_build: {% if intel_ethernet_operator.flow_config == 'on' and nic == 'cvl' %}true{% else %}false{% endif %} + # Use together with flow_configuration set in hostvars intel_ethernet_operator_flow_config_enabled: {% if intel_ethernet_operator.flow_config == 'on' and nic == 'cvl' %}true{% else %}false{% endif %} @@ -568,7 +596,7 @@ istio_service_mesh: # into the directory 'roles/istio_service_mesh/files/profiles/'. # 'custom-ca' profile name is reserved for usage by sgx_signer if sgx_signer option is enabled. # Any profile name provided will be overwritten in this case - profile: {% if istio_service_mesh.sgx_signer == 'on' and arch in ['icx'] %}custom-ca{% else %}default{% endif %} # Istio profile + profile: {% if istio_service_mesh.sgx_signer == 'on' and arch in ['icx', 'spr'] %}custom-ca{% else %}default{% endif %} # Istio profile intel_preview: enabled: {% if istio_service_mesh.intel_preview == 'on' %}true{% else %}false{% endif %} # Enable intel istio preview {% if istio_service_mesh.tcpip_bypass_ebpf in ['on', 'optional'] %} @@ -579,12 +607,12 @@ istio_service_mesh: tls_splicing: enabled: {% if istio_service_mesh.tls_splicing == 'on' %}true{% else %}false{% endif %} # Enable TLS splicing demo {% endif %} -{% if istio_service_mesh.sgx_signer in ['on', 'optional'] and arch in ['icx'] %} +{% if istio_service_mesh.sgx_signer in ['on', 'optional'] and arch in ['icx', 'spr'] %} sgx_signer: enabled: {% if istio_service_mesh.sgx_signer == 'on' %}true{% else %}false{% endif %} # Enable automated key management integration name: sgx-signer {% endif %} -{% if istio_service_mesh.intel_preview in ['on', 'optional'] and arch not in ['spr', 'emr']%} +{% if istio_service_mesh.intel_preview in ['on', 'optional'] and arch not in ['emr', 'gnr']%} # uncomment following section and enable intel_preview if sgx-mtls profile is selected {% if istio_service_mesh.intel_preview == 'optional' %}#{% endif %}set: # Istio intel preview with sgx-mtls {% if istio_service_mesh.intel_preview == 'optional' %}# {% endif %}- values.global.proxy.sgx.enabled=true # Istio intel preview with sgx-mtls @@ -605,61 +633,68 @@ linkerd_service_mesh: ## Telemetry & Observability ## ############################### -# Telemetry configuration.
There are two options, Telegraf and Collectd, which are mutually exclusive. -# Default option is Telegraf. -# If Telegraf is enabled then the following parts of the stack need to be enabled as well: elasticsearch, -# jaeger, opentelemetry, kibana. Collectd has to be disabled in that case. -# If Collectd is enabled then all Telegraf stack components need to be disabled. -{% if telemetry.prometheus in ['on', 'optional'] %} -prometheus_enabled: {% if telemetry.prometheus == 'on'%}true{% else %}false{% endif %} - -{% endif %} {% if telemetry.collectd in ['on', 'optional'] %} -collectd_enabled: {% if telemetry.collectd == 'on'%}true{% else %}false{% endif %} +# Collectd is a daemon which collects system information and provides mechanisms to store and monitor the values in a variety of ways. +# If Collectd is enabled then Telegraf must be disabled. +collectd_enabled: {% if telemetry.collectd == 'on'%}true{% else %}false{% endif +%} +collectd_scrape_interval: 30 {% endif %} {% if telemetry.telegraf in ['on', 'optional'] %} -telegraf_enabled: {% if telemetry.telegraf == 'on'%}true{% else %}false{% endif %} +# Telegraf is an agent for collecting, processing, aggregating, and writing metrics. +# If Telegraf is enabled then Collectd must be disabled. +telegraf_enabled: {% if telemetry.telegraf == 'on'%}true{% else %}false{% endif +%} +telegraf_scrape_interval: 30 {% endif %} {% if telemetry.jaeger in ['on', 'optional'] %} -jaeger_operator: {% if telemetry.jaeger == 'on'%}true{% else %}false{% endif %} +# Jaeger is a distributed tracing platform that can be used for monitoring microservices-based distributed systems. +# Jaeger in RA sends tracing telemetry data to ElasticSearch; therefore elasticsearch must be enabled as well. +jaeger_enabled: {% if telemetry.jaeger == 'on'%}true{% else %}false{% endif +%} {% endif %} -{% if telemetry.opentelemetry in ['on', 'optional'] %} -opentelemetry_enabled: {% if telemetry.opentelemetry == 'on'%}true{% else %}false{% endif %} +{% if cadvisor in ['on', 'optional'] %} +# cAdvisor provides container users an understanding of the resource usage and performance characteristics of their running containers. +# It is a running daemon that collects, aggregates, processes, and exports information about running containers. +cadvisor_enabled: {% if cadvisor == 'on' %}true{% else %}false{% endif +%} +# Enablement of scraping specific CPU perf events +cadvisor_sample_perf_events_enabled: false +cadvisor_pik_perf_events_enabled: false {% endif %} -{% if telemetry.elasticsearch in ['on', 'optional'] %} -elasticsearch_enabled: {% if telemetry.elasticsearch == 'on'%}true{% else %}false{% endif %} +{% if telemetry.opentelemetry in ['on', 'optional'] %} +# Opentelemetry collectors are used to scrape metrics from telegraf and cAdvisor and pass them to elasticsearch and prometheus. +# If Opentelemetry is enabled, prometheus, jaeger and elasticsearch must be enabled as well. +opentelemetry_enabled: {% if telemetry.opentelemetry == 'on'%}true{% else %}false{% endif +%} {% endif %} -{% if telemetry.kibana in ['on', 'optional'] %} -kibana_enabled: {% if telemetry.kibana == 'on'%}true{% else %}false{% endif %} +{% if telemetry.prometheus in ['on', 'optional'] %} +# Prometheus stack includes prometheus, node_exporter and grafana deployments.
+prometheus_stack_enabled: {% if telemetry.prometheus == 'on'%}true{% else %}false{% endif +%} {% endif %} -collectd_scrap_interval: 30 -telegraf_scrap_interval: 30 - -{% if cadvisor in ['on', 'optional'] %} -# cAdvisor -cadvisor_enabled: {% if cadvisor == 'on' %}true{% else %}false{% endif %} - -cadvisor_sample_perf_events_enabled: false -cadvisor_pik_perf_events_enabled: false +{% if telemetry.elasticsearch in ['on', 'optional'] %} +# Elasticsearch ECK is a distributed, RESTful search and analytics engine. +eck_enabled: {% if telemetry.elasticsearch == 'on'%}true{% else %}false{% endif +%} {% endif %} +{% if telemetry.kibana in ['on', 'optional'] %} +# Kibana is used to visualize data from elasticsearch. +# If Kibana is enabled, elasticsearch must be enabled as well. +kibana_enabled: {% if telemetry.kibana == 'on'%}true{% else %}false{% endif +%} +{% endif %} {% if telemetry.intel_xpumanager in ['on', 'optional'] %} # intel_xpumanager plugin collects information about Intel data center GPUs. -intel_xpumanager_enabled: {% if telemetry.intel_xpumanager == 'on'%}true{% else %}false{% endif %} +# If xpumanager is enabled, Prometheus stack must be enabled as well. +intel_xpumanager_enabled: {% if telemetry.intel_xpumanager == 'on'%}true{% else %}false{% endif +%} {% endif %} ###################### ## Power Management ## ###################### -{% if power.manager in ['on', 'optional'] and arch in ['icx', 'clx', 'spr', 'emr'] %} +{% if power.manager in ['on', 'optional'] and arch in ['icx', 'clx', 'spr', 'emr', 'gnr'] %} # Kubernetes Power Manager kubernetes_power_manager: enabled: {% if power.manager == 'on' %}true{% else %}false{% endif %} # Enable/Disable power manager @@ -668,24 +703,32 @@ kubernetes_power_manager: # - node1 # - node2 - build_image_locally: false # Build Power Manager image locally + build_image_locally: true # Build Power Manager image locally deploy_example_pods: true # Deploy example Pods that will utilize special resources global_shared_profile_enabled: true # Deploy custom Power Profile with user defined frequencies that can be applied to all power nodes # to make use of Shared Profile fill Shared Workload settings in host vars global_max_frequency: 1500 # Max frequency that will be applied for cores by Shared Workload global_min_frequency: 1000 # Min frequency that will be applied for cores by Shared Workload -{% if power.frequency_scaling in ['on', 'optional'] and arch in ['icx', 'clx', 'spr', 'emr'] %} +{% if power.frequency_scaling in ['on', 'optional'] and arch in ['icx', 'clx', 'spr', 'emr', 'gnr'] %} # !Please set up scaling driver in host_vars.yml file! # available governors: # "powersave" - Lowest frequency within the borders of min_frequency and max_frequency. # "performance" - Highest frequency within the borders of min_frequency and max_frequency. - # "userspace" - !ACPI ONLY! - Allow user space to set CPU frequency in scaling_setspeed attribute - # "schedutil" - !ACPI ONLY! - Uses data from CPU scheduler to set up frequency + # "userspace" - !CPUFREQ ONLY! - Allow user space to set CPU frequency in scaling_setspeed attribute + # "schedutil" - !CPUFREQ ONLY! - Uses data from CPU scheduler to set up frequency global_governor: "powersave" + +{% endif %} {% endif %} +{% if infra_power_manager in ['on', 'optional'] %} +# Warning: currently there's no support to deploy IPM. +# Setting this parameter to true will only apply required DPDK patches provided by IPM. 
+infrastructure_power_manager_enabled: {% if infra_power_manager == 'on' %}true{% else %}false{% endif %} + {% endif %} + ############################## ## Security & Certification ## ############################## @@ -734,7 +777,7 @@ kmra: {% endif %} {% if tcs in ['on', 'optional'] and - arch in ['icx'] %} + arch in ['icx', 'spr'] %} # Trusted Certificate Service deployment # https://github.com/intel/trusted-certificate-issuer tcs: @@ -743,7 +786,7 @@ tcs: {% endif %} {% if tac in ['on', 'optional'] and - arch in ['icx'] %} + arch in ['icx', 'spr'] %} # Trusted Attestation Controller deployment # https://github.com/intel/trusted-attestation-controller tac: @@ -820,10 +863,15 @@ always_pull_enabled: false # - http://mirror_ip:mirror_port #docker_insecure_registries: # - http://docker_insecure_registry_ip -#containerd_registries: -# "docker.io": -# - "https://registry-1.docker.io" -# - "http://mirror_ip:mirror_port" +#containerd_registries_mirrors: +# - prefix: docker.io +# mirrors: +# - host: https://registry-1.docker.io +# capabilities: ["pull", "resolve"] +# skip_verify: false +# - host: https://containerd_insecure_registry +# capabilities: ["pull", "resolve"] +# skip_verify: true #crio_registries: # - prefix: docker.io # insecure: false @@ -884,7 +932,8 @@ adq_dp: # In inventory.ini set "ip=" to IP address of CVL interface. # Additional requirements and details can be found in docs/calico_vpp.md calico_vpp: - enabled: {% if calico_vpp == 'on' %}true{% else %}false{% endif %} + enabled: {% if calico_vpp == 'on' %}true{% else %}false{% endif %} + interface_name: "enxcalicovpp001" {% endif %} {% if intel_eci and (intel_eci.values() | reject('eq', 'off')) | list | length() > 0 %} @@ -962,9 +1011,9 @@ mirror_urls: ffmpeg_install_enabled: {% if intel_ffmpeg == 'on' %}true{% else %}false{% endif %} ffmpeg_patches: - - url: "https://github.com/intel/cartwheel-ffmpeg/archive/refs/tags/2023q2.tar.gz" + - url: "https://github.com/intel/cartwheel-ffmpeg/archive/refs/tags/2023q3.tar.gz" type: "tar.gz" - sha256: "8c9a1b33bf1e034cd5ec0a9cf208cdb6e0846dae9d621040b83b1e5e31e59799" + sha256: "6d85524b99cc056b0823397d2b6f06e4375f61f59218b97b7b4713ba12739ae6" subdirectory: "patches/" patchset_enabled: true apply_all_patches: true @@ -973,4 +1022,21 @@ ffmpeg_patches: {% if base_container in ['on', 'optional'] %} intel_base_container_enabled: {% if base_container == 'on' %}true{% else %}false{% endif %} + +{% endif %} + +{% if name == 'on_prem_vss' %} +build_base_images: true +{% else %} +build_base_images: false +{% endif %} + +{% if inbm in ['on', 'optional'] %} +intel_inband_manageability_enabled: {% if inbm == 'on' %}true{% else %}false{% endif %} + +# Supported values for mode are 'inbc', 'cloud'. +# If local inbc option is chosen then provisioning will be performed automatically, otherwise provisioning should be run manually using provision-tc command. 
+# For more information please refer to +# https://github.com/intel/intel-inb-manageability/blob/develop/docs/In-Band%20Manageability%20Installation%20Guide%20Ubuntu.md +intel_inband_manageability_mode: 'inbc' {% endif %} diff --git a/generate/profiles_templates/common/host_vars.j2 b/generate/profiles_templates/common/host_vars.j2 index 0fee2afa..4eba79ee 100644 --- a/generate/profiles_templates/common/host_vars.j2 +++ b/generate/profiles_templates/common/host_vars.j2 @@ -69,7 +69,8 @@ cpusets: "4-11" install_dpdk: {% if dpdk == 'on' %}true{% else %}false{% endif %} # DPDK version (will be in action if install_dpdk: true) -dpdk_version: {% if (intel_flexran == 'on' or ovs_dpdk == 'on') %}"22.11.1"{% elif arch == "emr" %}"22.11.1"{% elif imtl == 'on'%}"23.03"{% else %}"23.07"{% endif %} # Note: dpdk_version is also dependent on ovs_dpdk when enabled (see preflight) +dpdk_version: {% if (intel_flexran == 'on' or ovs_dpdk == 'on' or infra_power_manager == 'on') %}"22.11.1"{% elif imtl == 'on'%}"23.03"{% else %}"23.11"{% endif %} + # Custom DPDK patches local path {% if intel_flexran == 'on' %}dpdk_local_patches_dir: "/tmp/flexran"{% else %}#dpdk_local_patches_dir: "/tmp/patches/dpdk"{% endif %} @@ -94,7 +95,15 @@ enable_dhclient_systemd_service: false ## Network Device Configuration ## ################################## +{% if on_vms != 'on' %} +default_ddp_profile: {% if nic == 'cvl' %}"ice_comms-1.3.45.0.pkg"{% else %}gtp.pkgo{% endif %} + +# default driver to use with dataplane_interfaces auto-configuration +dataplane_interface_default_vf_driver: "iavf" +{% endif %} + # dataplane interface configuration list +# leave empty for auto-configuration of NIC dataplane interfaces. dataplane_interfaces: [] #dataplane_interfaces: {% if on_vms == 'on' %} @@ -248,19 +257,15 @@ bond_cni_enabled: {% if bond_cni == 'on' %}true{% else %}false{% endif %} {% endif %} {% if network_userspace in ['on', 'optional'] %} -# Userspace CNI -userspace_cni_enabled: {% if network_userspace == 'on' %}true{% else %}false{% endif %} - - -ovs_dpdk_enabled: {% if ovs_dpdk == 'on' %}true{% else %}false{% endif %} # Should be enabled with Userspace CNI, when VPP is set to "false"; 1G hugepages required -ovs_version: "v3.2.0" # OVS version has to be compatible/functional with the DPDK version set by 'dpdk_version' -# CPU mask for OVS-DPDK PMD threads -ovs_dpdk_lcore_mask: 0x1 -# Hugepages allocated by OVS-DPDK per NUMA node in megabytes -ovs_dpdk_socket_mem: "256,0" # Example 1: "256,512" allocates 256MB from node 0 and 512MB from node 1 - # Example 2: "1024" allocates 1GB from node 0 on a single socket board, e.g. in a VM - -vpp_enabled: {% if vpp == 'on'%}true{% else %}false{% endif %} # Should be enabled with Userspace CNI, when ovs_dpdk is set to "false"; 2M hugepages required +# Userspace CNI related configuration, applied when userspace_cni_enabled set to 'true' in group_vars +userspace_cni: + vswitch: {% if vpp == 'on' %}vpp{% else %}ovs{% endif %} # Supported values: ovs, vpp + # OVS DPDK related configuration + ovs_version: "v3.2.1" # OVS version has to be compatible/functional with the DPDK version set by 'dpdk_version' + ovs_dpdk_lcore_mask: 0x1 # CPU mask for OVS-DPDK PMD threads + ovs_dpdk_socket_mem: "256,0" # Hugepages allocated by OVS-DPDK per NUMA node in megabytes + # Example 1: "256,512" allocates 256MB from node 0 and 512MB from node 1 + # Example 2: "1024" allocates 1GB from node 0 on a single socket board, e.g. 
in a VM {% endif %} ################## @@ -294,7 +299,7 @@ persistent_volumes: [] # persistentVolumeReclaimPolicy: "Retain" # Reclaim policy when a volume is released once it's bound, e.g., Retain/Recycle/Delete {% endif %} # mountPath: /mnt/disks/disk1 # Mount path of a volume, for local provisioner, it musts match /mnt/disks/* pattern -# device: /dev/nvme1n1 # Target storage device name when creating a volume. Only set it when storage_deploy_test_mode is false +# device: /dev/nvme0n1 # Target storage device name when creating a volume. Only set it when storage_deploy_test_mode is false # fsType: ext4 # file system types, [ext4, xfs] # - name: "mnt-data-2" @@ -304,7 +309,7 @@ persistent_volumes: [] # persistentVolumeReclaimPolicy: "Retain" {% endif %} # mountPath: /mnt/disks/disk2 -# device: /dev/nvme2n1 +# device: /dev/nvme1n1 # fsType: ext4 # - name: "mnt-data-3" @@ -314,7 +319,7 @@ persistent_volumes: [] # persistentVolumeReclaimPolicy: "Retain" {% endif %} # mountPath: /mnt/disks/disk3 -# device: /dev/nvme3n1 +# device: /dev/nvme2n1 # fsType: ext4 # - name: "mnt-data-4" @@ -324,7 +329,7 @@ persistent_volumes: [] # persistentVolumeReclaimPolicy: "Retain" {% endif %} # mountPath: /mnt/disks/disk4 -# device: /dev/nvme4n1 +# device: /dev/nvme3n1 # fsType: ext4 {% endif %} @@ -343,7 +348,7 @@ configure_fpga: {% if fpga == 'on' %}true{% else %}false{% endif %} {% endif %} -{% if sgx in ['on', 'optional'] and arch in ['icx', 'spr', 'emr'] %} +{% if sgx in ['on', 'optional'] and arch in ['icx', 'spr', 'emr', 'gnr'] %} # Intel Software Guard Extensions (SGX) configure_sgx: {% if sgx == 'on' %}true{% else %}false{% endif %} @@ -354,13 +359,13 @@ configure_sgx: {% if sgx == 'on' %}true{% else %}false{% endif %} configure_gpu: {% if gpu == 'on' %}true{% else %}false{% endif %} {% endif %} -{% if dlb in ['on', 'optional'] and arch in ['spr', 'emr'] %} +{% if dlb in ['on', 'optional'] and arch in ['spr', 'emr', 'gnr'] %} # Configure SIOV and Intel DLB devices - required for Intel DLB Device Plugin support configure_dlb_devices: {% if dlb == "on" %}true{% else %}false{% endif %} {% endif %} -{% if dsa in ['on', 'optional'] and arch in ['spr', 'emr'] %} +{% if dsa in ['on', 'optional'] and arch in ['spr', 'emr', 'gnr'] %} # Configure SIOV and Intel DSA devices - required for Intel DSA Device Plugin support configure_dsa_devices: {% if dsa == "on" %}true{% else %}false{% endif %} @@ -398,11 +403,26 @@ update_qat_drivers: {% if qat == "on" %}true{% else %}false{% endif %} {% if arch == "emr" and qat == "on" %} # EMR QAT driver version -emr_qat_driver_package: QAT20.L.1.1.11-00016.tar.gz +nda_qat_driver_package: QAT20.L.1.1.20-00030.tar.gz # SHA1 sum value for the driver package -emr_qat_driver_pkg_checksum: 73ba41e63bc83f9437a34131ff5e8fb09b4746ae +nda_qat_driver_pkg_checksum: c578a26b876823174441dfced9ac4fcaa41697ec # Path to store the EMR QAT package on the ansible host. 
-emr_qat_driver_staging_folder: /tmp/emr_qat/ +nda_qat_driver_folder: /tmp/nda_qat/ +{% endif %} + +{% if arch == "gnr" and qat == "on" %} +# GNR QAT driver version +#nda_qat_driver_package: QAT21.L.1.2.1-00016.tar.gz +#nda_qat_driver_package: QAT20.L.1.2.21-00014.tar.gz +nda_qat_driver_package: QAT20.L.1.2.20-00064.tar.gz +nda_qat_driver_package_rocky: gnr_sp_alpha_centos_qat21.l.1.2.1-00013.tar.gz +# SHA1 sum value for the driver package +#nda_qat_driver_pkg_checksum: 8edf08b0a7dd76479f0e3af0085790b8b5901baf +#nda_qat_driver_pkg_checksum: 81d4af17cd5e065da2ea91439fbdd271b1488812 +nda_qat_driver_pkg_checksum: 854dc526aa3ef85ef1a1e3a3c03b9a83b8363d52 +nda_qat_driver_pkg_checksum_rocky: 78d2bd9d2022ee662d33c179c3d876b8e733cd49 +# Path to store the GNR QAT package on the ansible host. +nda_qat_driver_folder: /tmp/nda_qat/ {% endif %} # Enabling the option will configure the QAT device. Must be enabled when qat is on. @@ -412,17 +432,17 @@ configure_qat: {% if qat == "on" %}true{% else %}false{% endif %} enabled_qat_service: "qat" disabled_qat_service: "qat_service" -{% if arch in ['spr', 'emr'] %} +{% if arch in ['spr', 'emr', 'gnr'] %} enable_qat_svm: false # Enable QAT Shared Virtual Memory (SVM). Only for OOT driver. {% endif %} # QAT parameters used by auto detection of qat devices -qat_sriov_numvfs_required: {% if on_vms == 'on' %}0{% else %}8{% endif %} -qat_vf_driver_required: {% if arch in ['spr', 'emr'] %}"4xxxvf"{% else %}"c6xxvf"{% endif %} +qat_vf_driver_required: {% if arch in ['spr', 'emr', 'gnr'] %}"4xxxvf"{% else %}"c6xxvf"{% endif %} # QAT interface configuration list +# Leave empty for auto-configuration of QAT devices. qat_devices: [] #qat_devices: {% if on_vms == 'on' %} @@ -435,7 +455,7 @@ qat_devices: [] # - qat_id: "0000:ab:00.0" # QAT device id one using DPDK compatible driver for VF devices to be used by vfio-pci kernel driver, replace as required # qat_sriov_numvfs: 12 # Number of VFs per PF to create - cannot exceed the maximum number of VFs available for the device. Set to 0 to not create any VFs. # # Note: Currently when trying to create fewer virtual functions than the maximum, the maximum number always gets created. 
-# qat_default_vf_driver: {% if arch in ['spr', 'emr'] %}"4xxxvf"{% else %}"c6xxvf"{% endif %} +# qat_default_vf_driver: {% if arch in ['spr', 'emr', 'gnr'] %}"4xxxvf"{% else %}"c6xxvf"{% endif %} # qat_vfs: # Used to configure a non-default VF driver for individual VFs # vf_00: "vfio-pci" # Configures the 1st VF with "vfio-pci" driver @@ -443,14 +463,14 @@ qat_devices: [] # - qat_id: "0000:xy:00.0" # qat_sriov_numvfs: 10 -# qat_default_vf_driver: {% if arch in ['spr', 'emr'] %}"4xxxvf"{% else %}"c6xxvf"{% endif %} +# qat_default_vf_driver: {% if arch in ['spr', 'emr', 'gnr'] %}"4xxxvf"{% else %}"c6xxvf"{% endif %} # qat_vfs: {} # To use the default VF driver for all VFs {% endif %} {% endif %} -{% if arch in ['spr', 'emr'] and tdx in ['on', 'optional']%} +{% if arch in ['spr', 'emr', 'gnr'] and tdx in ['on', 'optional']%} # EMR TDX configuration configure_tdx: {% if tdx == 'on' %}true{% else %}false{% endif %} @@ -510,13 +530,13 @@ exclude_collectd_plugins: [] ###################### ## Power Management ## ###################### -{% if power.manager in ['on', 'optional'] and arch in ['icx', 'clx', 'spr', 'emr'] %} +{% if power.manager in ['on', 'optional'] and arch in ['icx', 'clx', 'spr', 'emr', 'gnr'] %} # The performance profile is available for nodes that has CPU max MHz > 3500.0000 - use 'lscpu' command to see your node details # To use PowerProfiles in this list as sample pods on this node, please set 'deploy_example_pods' to true in group_vars power_profiles: [balance-performance] # Possible PowerProfiles are: [performance, balance-performance, balance-power] -{% if power.frequency_scaling in ['on', 'optional'] and arch in ['icx', 'clx', 'spr', 'emr'] %} -frequency_scaling_driver: intel_pstate # Possible values: [intel_pstate, acpi_cpufreq] +{% if power.frequency_scaling in ['on', 'optional'] and arch in ['icx', 'clx', 'spr', 'emr', 'gnr'] %} +frequency_scaling_driver: intel_pstate # Possible values: [intel_pstate, intel_cpufreq] {% endif %} # Power Manager Shared Profile/Workload settings. @@ -526,12 +546,12 @@ local_shared_profile: local_max_frequency: 2000 local_min_frequency: 1500 -{% if power.frequency_scaling in ['on', 'optional'] and arch in ['icx', 'clx', 'spr', 'emr'] %} +{% if power.frequency_scaling in ['on', 'optional'] and arch in ['icx', 'clx', 'spr', 'emr', 'gnr'] %} # available governors: # "powersave" - Lowest frequency within the borders of min_frequency and max_frequency. # "performance" - Highest frequency within the borders of min_frequency and max_frequency. - # "userspace" - !ACPI ONLY! - Allow user space to set CPU frequency in scaling_setspeed attribute - # "schedutil" - !ACPI ONLY! - Uses data from CPU scheduler to set up frequency + # "userspace" - !CPUFREQ ONLY! - Allow user space to set CPU frequency in scaling_setspeed attribute + # "schedutil" - !CPUFREQ ONLY! - Uses data from CPU scheduler to set up frequency local_governor: "powersave" {% endif %} @@ -561,7 +581,7 @@ uncore_frequency: # max: 2400000 {% endif %} -{% if power.cstate in ['on', 'optional'] and arch in ['icx', 'clx', 'spr', 'emr'] %} +{% if power.cstate in ['on', 'optional'] and arch in ['icx', 'clx', 'spr', 'emr', 'gnr'] %} cstates: enabled: {% if power.cstate == "on" %}true{% else %}false{% endif %} # Enable/Disable cstates @@ -578,10 +598,53 @@ cstates: # C1: true # C6: false {% endif %} +{% if power.time_of_day in ['on', 'optional'] and arch in ['icx', 'clx', 'spr', 'emr', 'gnr'] %} +# There is a known limitation upstream where only one Time of Day schedule is allowed in the cluster.
+# This should be fixed in the next release of KPM. +# When using multiple workers, only enable time of day for one of them. +time_of_day: + enabled: {% if power.time_of_day == "on" %}true{% else %}false{% endif %} + + time_zone: "Europe/Prague" # refer to 'IANA timezone database' for valid value + schedule: [] + # schedule examples: + # - time: "14:24" + # # powerProfile sets the profile for the shared pool + # powerProfile: balance-performance + # + # # this transitions exclusive pods matching a given label from one profile to another + # # please ensure that only pods to be used by power manager have this label + # pods: + # - labels: + # matchLabels: + # power: "true" + # target: balance-performance + # - labels: + # matchLabels: + # special: "false" + # target: balance-performance + # + # # cState field simply takes a cstate spec + # cState: + # sharedPoolCStates: + # C1: false + # C6: true + # + # - time: "14:26" + # powerProfile: performance + # cState: + # sharedPoolCStates: + # C1: true + # C6: false + # + # - time: "14:28" + # powerProfile: balance-power + reserved_cpus: [0, 1] +{% endif %} {% endif %} {% if sst in ['on', 'optional'] %} -{% if arch in ['icx', 'spr', 'emr'] %} +{% if arch in ['icx', 'spr', 'emr', 'gnr'] %} # Intel(R) SST-PP (perf-profile) configuration sst_pp_configuration_enabled: {% if sst == "on" %}true{% else %}false{% endif %} @@ -794,20 +857,22 @@ vms: #numa: 1 cpu_total: 16 #alloc_all: true - memory: 61440 + memory: 51200 vxlan: 120 + # leave empty for auto-configuration of NIC and QAT devices. + pci: [] {% if name not in ['build_your_own'] %} - pci: - - "18:02.2" # 18:xx.x are example VFs for networking - - "18:02.3" - - "18:02.4" - - "18:02.5" + #pci: + # - "18:02.2" # 18:xx.x are example VFs for networking + # - "18:02.3" + # - "18:02.4" + # - "18:02.5" + nic_devices_count: 4 # Used for auto-configuration of NICs. {% if qat == "on" %} - - "3d:01.1" # 3x:xx.x are example VFs for QAT - - "3f:01.1" + # - "3d:01.1" # 3x:xx.x are example VFs for QAT + # - "3f:01.1" + qat_devices_count: 2 # Used for auto-configuration of QAT. {% endif %} -{% else %} - pci: [] {% endif %} # - type: "work" {% if secondary_host == 'true' %} @@ -816,20 +881,22 @@ vms: # name: "vm-work-3" {% endif %} # cpu_total: 16 -# memory: 61440 +# memory: 51200 # vxlan: 120 +# # leave empty for auto-configuration of NIC and QAT devices. +# pci: [] {% if name not in ['build_your_own'] %} -# pci: -# - "18:02.0" # 18:xx.x are example VFs for networking -# - "18:02.1" -# - "18:02.6" -# - "18:02.7" +# #pci: +# # - "18:02.0" # 18:xx.x are example VFs for networking +# # - "18:02.1" +# # - "18:02.6" +# # - "18:02.7" +# nic_devices_count: 4 # Used for auto-configuration of NICs. {% if qat == "on" %} -# - "3d:01.2" # 3x:xx.x are example VFs for QAT -# - "3f:01.2" +# # - "3d:01.2" # 3x:xx.x are example VFs for QAT +# # - "3f:01.2" +# qat_devices_count: 2 # Used for auto-configuration of QAT. {% endif %} -{% else %} -# pci: [] {% endif %} # - type: "vm" {% if secondary_host == 'true' %} @@ -838,20 +905,22 @@ vms: # name: "vm-1" {% endif %} # cpu_total: 4 -# memory: 61440 +# memory: 51200 # vxlan: 120 +# # leave empty for auto-configuration of NIC and QAT devices. +# pci: [] {% if name not in ['build_your_own'] %} -# pci: -# - "18:02.0" # 18:xx.x are example VFs for networking -# - "18:02.1" -# - "18:02.6" -# - "18:02.7" +# #pci: +# # - "18:02.0" # 18:xx.x are example VFs for networking +# # - "18:02.1" +# # - "18:02.6" +# # - "18:02.7" +# nic_devices_count: 4 # Used for auto-configuration of NICs. 
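# # Note: a condensed sketch of the counts-based flow -- with pci left as [],
# # the vm/auto_configure_nic_devices and vm/auto_configure_qat_devices roles
# # (wired into playbooks/infra/prepare_vms.yml further below) attach
# # nic_devices_count networking VFs and, when qat is on, qat_devices_count
# # QAT VFs to each VM; the counts shown here are examples, not requirements.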
{% if qat == "on" %} -# - "3d:01.2" # 3x:xx.x are example VFs for QAT -# - "3f:01.2" +# # - "3d:01.2" # 3x:xx.x are example VFs for QAT +# # - "3f:01.2" +# qat_devices_count: 2 # Used for auto-configuration of QAT. {% endif %} -{% else %} -# pci: [] {% endif %} diff --git a/generate/profiles_templates/k8s/profiles.yml b/generate/profiles_templates/k8s/profiles.yml index 74b7a1e6..ebbbf2c3 100644 --- a/generate/profiles_templates/k8s/profiles.yml +++ b/generate/profiles_templates/k8s/profiles.yml @@ -49,6 +49,7 @@ # frequency_scaling # cstate # uncore_frequency +# time_of_day # - telemetry: # prometheus # collectd @@ -109,6 +110,12 @@ # - calico_vpp # - ido # - imtl +# - base_container +# - inbm +# - container_runtime_default - is in ['containerd', 'crio', 'docker'] +# - kubevirt +# - infra_power_manager +# - ingress_nginx access: name: access @@ -126,7 +133,7 @@ access: bond_cni: off qat: optional qat_dp: optional - openssl: on + openssl: optional dsa: optional dsa_dp: optional dlb: optional @@ -155,6 +162,7 @@ access: frequency_scaling: off cstate: off uncore_frequency: off + time_of_day: off telemetry: prometheus: on collectd: optional @@ -188,7 +196,7 @@ access: flow_config: optional ddp_update: optional fw_update: optional - intel_sriov_fec_operator: on + intel_sriov_fec_operator: optional intel_oneapi: base: on ai: optional @@ -201,6 +209,11 @@ access: calico_vpp: optional imtl: optional base_container: off + inbm: off + container_runtime_default: containerd + kubevirt: optional + infra_power_manager: optional + ingress_nginx: optional basic: name: basic @@ -220,6 +233,7 @@ basic: frequency_scaling: optional cstate: optional uncore_frequency: optional + time_of_day: optional telemetry: prometheus: on collectd: optional @@ -254,6 +268,113 @@ basic: calico_vpp: optional imtl: optional base_container: off + inbm: off + container_runtime_default: containerd + kubevirt: optional + infra_power_manager: optional + ingress_nginx: optional + +base_video_analytics: + name: base_video_analytics + vm_mode: optional + on_vms: optional + nfd: on + kube_dashboard: optional + rancher_manager: optional + isolcpu: on + cpusets: off + intel_cpu_controlplane: optional + native_cpu_manager: on + sriov_operator: on + sriov_network_dp: optional + nic_drivers: on + bond_cni: off + qat: on + qat_dp: on + openssl: on + dsa: on + dsa_dp: on + dlb: off + dlb_dp: off + gpu: on + gpu_dp: on + sgx: on + sgx_dp: on + kmra: + sbx: off + oran: off + pccs: off + apphsm: off + ctk_demo: off + tcs: off + tac: off + tas: on + gas: on + ddp_legacy: on + network_userspace: on + vpp: optional + dpdk: on + ovs_dpdk: on + sst: optional + power: + manager: on + frequency_scaling: on + cstate: on + uncore_frequency: optional + time_of_day: optional + telemetry: + prometheus: on + collectd: off + telegraf: on + jaeger: on + opentelemetry: on + elasticsearch: on + kibana: off + intel_xpumanager: optional + istio_service_mesh: + enabled: off + tcpip_bypass_ebpf: off + tls_splicing: off + sgx_signer: off + intel_preview: off + linkerd_service_mesh: + enabled: off + wireguard: off + multus: on + firewall: optional + minio: off + lpvsp: on + rook_ceph: off + intel_media_analytics: off + intel_ffmpeg: off + cert_manager: on + registry: on + hugepages: on + intel_ethernet_operator: + enabled: on + flow_config: optional + ddp_update: on + fw_update: optional + intel_sriov_fec_operator: off + tadk: off + adq_dp: optional + intel_flexran: off + sigstore_policy_controller: off + intel_oneapi: + base: off + ai: off + cadvisor: on + fpga: off 
+ tdx: optional + calico_vpp: optional + ido: off + imtl: on + base_container: off + inbm: off + container_runtime_default: containerd + kubevirt: on + infra_power_manager: on + ingress_nginx: on full_nfv: name: full_nfv @@ -301,6 +422,7 @@ full_nfv: frequency_scaling: on cstate: on uncore_frequency: on + time_of_day: optional telemetry: prometheus: on collectd: optional @@ -348,6 +470,11 @@ full_nfv: ido: optional imtl: optional base_container: off + inbm: off + container_runtime_default: containerd + kubevirt: on + infra_power_manager: optional + ingress_nginx: on on_prem: name: on_prem @@ -391,6 +518,7 @@ on_prem: frequency_scaling: optional cstate: optional uncore_frequency: optional + time_of_day: optional telemetry: prometheus: on collectd: optional @@ -434,7 +562,12 @@ on_prem: calico_vpp: optional ido: optional imtl: optional - base_container: off + base_container: optional + inbm: off + container_runtime_default: containerd + kubevirt: optional + infra_power_manager: optional + ingress_nginx: optional on_prem_vss: name: on_prem_vss @@ -449,13 +582,13 @@ on_prem_vss: sriov_network_dp: optional nic_drivers: on bond_cni: optional - qat: on - qat_dp: on + qat: optional + qat_dp: optional openssl: on dsa: on dsa_dp: on - dlb: on - dlb_dp: on + dlb: optional + dlb_dp: optional gpu: on gpu_dp: on sgx: on @@ -480,6 +613,7 @@ on_prem_vss: frequency_scaling: optional cstate: optional uncore_frequency: optional + time_of_day: optional telemetry: prometheus: optional collectd: optional @@ -520,7 +654,12 @@ on_prem_vss: tdx: off calico_vpp: optional imtl: optional - base_container: off + base_container: on + inbm: off + container_runtime_default: containerd + kubevirt: optional + infra_power_manager: optional + ingress_nginx: optional on_prem_sw_defined_factory: name: on_prem_sw_defined_factory @@ -568,6 +707,7 @@ on_prem_sw_defined_factory: frequency_scaling: optional cstate: optional uncore_frequency: optional + time_of_day: optional telemetry: prometheus: optional collectd: optional @@ -625,6 +765,11 @@ on_prem_sw_defined_factory: calico_vpp: optional imtl: optional base_container: off + inbm: off + container_runtime_default: containerd + kubevirt: optional + infra_power_manager: optional + ingress_nginx: optional on_prem_aibox: name: on_prem_aibox @@ -715,6 +860,11 @@ on_prem_aibox: calico_vpp: off imtl: optional base_container: on + inbm: optional + container_runtime_default: docker + kubevirt: off + infra_power_manager: off + ingress_nginx: off regional_dc: name: regional_dc @@ -749,6 +899,7 @@ regional_dc: frequency_scaling: optional cstate: optional uncore_frequency: optional + time_of_day: optional telemetry: prometheus: on collectd: optional @@ -793,6 +944,11 @@ regional_dc: ido: optional imtl: optional base_container: off + inbm: off + container_runtime_default: containerd + kubevirt: optional + infra_power_manager: optional + ingress_nginx: optional remote_fp: name: remote_fp @@ -834,6 +990,7 @@ remote_fp: frequency_scaling: on cstate: optional uncore_frequency: optional + time_of_day: optional telemetry: prometheus: on collectd: on @@ -878,6 +1035,11 @@ remote_fp: ido: optional imtl: optional base_container: off + inbm: off + container_runtime_default: containerd + kubevirt: optional + infra_power_manager: optional + ingress_nginx: optional build_your_own: name: build_your_own @@ -925,6 +1087,7 @@ build_your_own: frequency_scaling: optional cstate: optional uncore_frequency: optional + time_of_day: optional telemetry: prometheus: optional collectd: optional @@ -988,3 +1151,8 @@ 
build_your_own: ido: optional imtl: optional base_container: off + inbm: off + container_runtime_default: containerd + kubevirt: optional + infra_power_manager: optional + ingress_nginx: optional diff --git a/generate/profiles_templates/vm/vm_host_profiles.yml b/generate/profiles_templates/vm/vm_host_profiles.yml index a99d95d3..09498d0b 100644 --- a/generate/profiles_templates/vm/vm_host_profiles.yml +++ b/generate/profiles_templates/vm/vm_host_profiles.yml @@ -42,6 +42,7 @@ # frequency_scaling # cstate # uncore_frequency +# time_of_day # - sst # - telemetry: # prometheus @@ -77,6 +78,9 @@ # - cadvisor # - tdx # - imtl +# - container_runtime_default - is in ['containerd', 'crio', 'docker'] +# - infra_power_manager +# - ingress_nginx # sriov_operator is permanently disabled in VM mode # sriov_network_dp and dpdk are enabled for all VM mode profiles except build_your_own @@ -106,6 +110,7 @@ access: frequency_scaling: off cstate: off uncore_frequency: off + time_of_day: off telemetry: prometheus: on collectd: optional @@ -140,6 +145,9 @@ access: cadvisor: on tdx: off imtl: optional + container_runtime_default: containerd + infra_power_manager: optional + ingress_nginx: optional basic: name: basic @@ -158,6 +166,7 @@ basic: frequency_scaling: optional cstate: optional uncore_frequency: optional + time_of_day: optional telemetry: prometheus: on collectd: optional @@ -184,6 +193,9 @@ basic: cadvisor: on tdx: optional imtl: optional + container_runtime_default: containerd + infra_power_manager: optional + ingress_nginx: optional full_nfv: name: full_nfv @@ -224,6 +236,7 @@ full_nfv: frequency_scaling: optional cstate: optional uncore_frequency: optional + time_of_day: optional sst: optional telemetry: prometheus: on @@ -260,6 +273,9 @@ full_nfv: cadvisor: on tdx: off imtl: optional + container_runtime_default: containerd + infra_power_manager: optional + ingress_nginx: on on_prem: name: on_prem @@ -294,6 +310,7 @@ on_prem: frequency_scaling: optional cstate: optional uncore_frequency: optional + time_of_day: optional sst: optional telemetry: prometheus: on @@ -329,6 +346,9 @@ on_prem: cadvisor: on tdx: off imtl: optional + container_runtime_default: containerd + infra_power_manager: optional + ingress_nginx: optional on_prem_sw_defined_factory: name: on_prem_sw_defined_factory @@ -406,6 +426,9 @@ on_prem_sw_defined_factory: cadvisor: optional tdx: off imtl: optional + container_runtime_default: containerd + infra_power_manager: optional + ingress_nginx: optional regional_dc: name: regional_dc @@ -439,6 +462,7 @@ regional_dc: frequency_scaling: optional cstate: optional uncore_frequency: optional + time_of_day: optional telemetry: prometheus: on collectd: optional @@ -473,6 +497,9 @@ regional_dc: cadvisor: on tdx: off imtl: optional + container_runtime_default: containerd + infra_power_manager: optional + ingress_nginx: optional remote_fp: name: remote_fp @@ -509,6 +536,7 @@ remote_fp: frequency_scaling: optional cstate: optional uncore_frequency: optional + time_of_day: optional sst: optional telemetry: prometheus: on @@ -545,6 +573,9 @@ remote_fp: cadvisor: optional tdx: off imtl: optional + container_runtime_default: containerd + infra_power_manager: optional + ingress_nginx: optional build_your_own: name: build_your_own @@ -585,6 +616,7 @@ build_your_own: frequency_scaling: optional cstate: optional uncore_frequency: optional + time_of_day: optional sst: optional telemetry: prometheus: optional @@ -621,3 +653,6 @@ build_your_own: cadvisor: optional tdx: off imtl: optional + 
container_runtime_default: containerd + infra_power_manager: optional + ingress_nginx: optional diff --git a/generate/profiles_templates/vm/vms_profiles.yml b/generate/profiles_templates/vm/vms_profiles.yml index a1873181..14dd53af 100644 --- a/generate/profiles_templates/vm/vms_profiles.yml +++ b/generate/profiles_templates/vm/vms_profiles.yml @@ -43,6 +43,7 @@ # frequency_scaling # cstate # uncore_frequency +# time_of_day # - telemetry: # prometheus # collectd @@ -77,6 +78,9 @@ # - cadvisor # - tdx # - imtl +# - container_runtime_default - is in ['containerd', 'docker', 'crio'] +# - infra_power_manager +# - ingress_nginx # sriov_operator is permanently disabled in VM mode # sriov_network_dp and dpdk are enabled for all VM mode profiles except build_your_own @@ -112,6 +116,7 @@ access: frequency_scaling: off cstate: off uncore_frequency: off + time_of_day: off telemetry: prometheus: on collectd: optional @@ -146,6 +151,9 @@ access: cadvisor: on tdx: off imtl: optional + container_runtime_default: containerd + infra_power_manager: optional + ingress_nginx: optional basic: name: basic @@ -164,6 +172,7 @@ basic: frequency_scaling: optional cstate: optional uncore_frequency: optional + time_of_day: optional telemetry: prometheus: on collectd: optional @@ -190,6 +199,9 @@ basic: cadvisor: on tdx: optional imtl: optional + container_runtime_default: containerd + infra_power_manager: optional + ingress_nginx: optional full_nfv: name: full_nfv @@ -231,6 +243,7 @@ full_nfv: frequency_scaling: optional cstate: optional uncore_frequency: optional + time_of_day: optional telemetry: prometheus: on collectd: optional @@ -266,6 +279,9 @@ full_nfv: cadvisor: on tdx: off imtl: optional + container_runtime_default: containerd + infra_power_manager: optional + ingress_nginx: on on_prem: name: on_prem @@ -301,6 +317,7 @@ on_prem: frequency_scaling: optional cstate: optional uncore_frequency: optional + time_of_day: optional telemetry: prometheus: on collectd: optional @@ -335,6 +352,9 @@ on_prem: cadvisor: on tdx: off imtl: optional + container_runtime_default: containerd + infra_power_manager: optional + ingress_nginx: optional on_prem_sw_defined_factory: name: on_prem_sw_defined_factory @@ -412,6 +432,9 @@ on_prem_sw_defined_factory: cadvisor: optional tdx: off imtl: optional + container_runtime_default: containerd + infra_power_manager: optional + ingress_nginx: optional regional_dc: name: regional_dc @@ -445,6 +468,7 @@ regional_dc: frequency_scaling: optional cstate: optional uncore_frequency: optional + time_of_day: optional telemetry: prometheus: on collectd: optional @@ -479,6 +503,9 @@ regional_dc: cadvisor: on tdx: off imtl: optional + container_runtime_default: containerd + infra_power_manager: optional + ingress_nginx: optional remote_fp: name: remote_fp @@ -516,6 +543,7 @@ remote_fp: frequency_scaling: optional cstate: optional uncore_frequency: optional + time_of_day: optional telemetry: prometheus: on collectd: on @@ -551,6 +579,9 @@ remote_fp: cadvisor: optional tdx: off imtl: optional + container_runtime_default: containerd + infra_power_manager: optional + ingress_nginx: optional build_your_own: name: build_your_own @@ -592,6 +623,7 @@ build_your_own: frequency_scaling: optional cstate: optional uncore_frequency: optional + time_of_day: optional telemetry: prometheus: optional collectd: optional @@ -627,3 +659,6 @@ build_your_own: cadvisor: optional tdx: off imtl: optional + container_runtime_default: containerd + infra_power_manager: optional + ingress_nginx: optional diff --git 
a/generate/render_util/common/cli.py b/generate/render_util/common/cli.py index 037ec998..24c79d0f 100644 --- a/generate/render_util/common/cli.py +++ b/generate/render_util/common/cli.py @@ -33,12 +33,12 @@ def parse_cli() -> argparse.Namespace: parser.add_argument('--host', type=str, default="host_vars.j2", help='host_vars template filepath') parser.add_argument('--profile', '-p', type=str, default='', - choices={'all_examples', 'access', 'basic', 'full_nfv', 'on_prem', + choices={'all_examples', 'access', 'basic', 'base_video_analytics', 'full_nfv', 'on_prem', 'on_prem_vss', 'on_prem_sw_defined_factory', 'regional_dc', 'remote_fp', 'build_your_own'}, # add new profiles here help='''profile name which files, required in deployment, will be copied to the project root directory''') - parser.add_argument('--arch', '-a', type=str, default='icx', - choices={"atom", "core", "skl", "clx", "icx", "spr", "emr", "ultra"}) # please add arch acronyms here + parser.add_argument('--arch', '-a', type=str, default='spr', + choices={"atom", "core", "skl", "clx", "icx", "spr", "emr", "gnr", "ultra"}) # please add arch acronyms here parser.add_argument('--nic', '-n', type=str, default='cvl', choices={"cvl", "fvl"}) # please add new NICs here parser.add_argument('--mode', type=str, default='k8s', choices={"k8s", "vm", "cloud"}, help='generate configuration files for selected mode') # please add new modes' name here diff --git a/generate/render_util/common/common.py b/generate/render_util/common/common.py index 00a4be0e..69d7ec2b 100644 --- a/generate/render_util/common/common.py +++ b/generate/render_util/common/common.py @@ -65,7 +65,6 @@ def add_nic_parameter(profiles: dict, args: argparse.Namespace) -> None: def add_mirrors_parameter(profiles: dict, args: argparse.Namespace) -> None: """Add mirrors information to profiles config""" for p in profiles.values(): - print(args.mirrors) p['mirrors'] = args.mirrors diff --git a/generate/render_util/renderers/playbook.py b/generate/render_util/renderers/playbook.py index 0574b4a9..f6663427 100644 --- a/generate/render_util/renderers/playbook.py +++ b/generate/render_util/renderers/playbook.py @@ -21,7 +21,7 @@ import os from render_util.common.common import render -_available_playbooks = [ 'access', 'basic', 'full_nfv', 'on_prem', 'on_prem_vss', +_available_playbooks = [ 'access', 'basic', 'base_video_analytics', 'full_nfv', 'on_prem', 'on_prem_vss', 'on_prem_sw_defined_factory', 'on_prem_aibox', 'remote_fp', 'regional_dc', 'build_your_own'] _playbook_dir = 'playbooks' diff --git a/library/cpupin.py b/library/cpupin.py index d2a19510..9b7a771f 100644 --- a/library/cpupin.py +++ b/library/cpupin.py @@ -10,12 +10,14 @@ short_description: cpupin module get the available resources from host server and do the CPU pinning for VMs -version_added: "1.0" +version_added: "1.1" description: cpupin module get the available resources from host server and do the CPU pinning for VMs. By default it selects CPUs from single NUMA node and do NUMA allignment for VMs With option alloc_all it selects all available CPUs from all NUMA nodes. It reserves configured amount of CPUs for host OS. 
+ It preserves CPU allocations during VM redeployment and supports CPU reallocation
+ It returns unused CPUs back to the pool

options:
name:
@@ -47,9 +49,14 @@
description: Do the actual cpu-pinning
required: true
type: boolean
+ host_name:
+ description: Name of the VM host machine
+ required: true
+ type: str

-author:
+authors:
- Lumir Jasiok (lumirx.jasiok@intel.com)
+ - Jiri Prokes (jirix.prokes@intel.com)
'''

EXAMPLES = r'''
diff --git a/playbooks/autodetect.yml b/playbooks/autodetect.yml
new file mode 100644
index 00000000..d67fc551
--- /dev/null
+++ b/playbooks/autodetect.yml
@@ -0,0 +1,45 @@
+##
+## Copyright (c) 2020-2023 Intel Corporation.
+##
+## Licensed under the Apache License, Version 2.0 (the "License");
+## you may not use this file except in compliance with the License.
+## You may obtain a copy of the License at
+##
+## http://www.apache.org/licenses/LICENSE-2.0
+##
+## Unless required by applicable law or agreed to in writing, software
+## distributed under the License is distributed on an "AS IS" BASIS,
+## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+## See the License for the specific language governing permissions and
+## limitations under the License.
+##
+---
+- hosts: kube_node,vm_host
+ any_errors_fatal: true
+ roles:
+ - role: bootstrap/auto_detect_nic_devices # noqa role-name[path]
+ tags:
+ - auto-detect-nic-device
+ - determine-dataplane-interfaces
+ - update-nic-firmware
+ - intel-ethernet-operator
+ - setup-sriov-nic
+ - sriov-network-operator
+ when:
+ - dataplane_interfaces is defined
+ - dataplane_interfaces | default([]) | length == 0
+ - (update_nic_drivers is defined and update_nic_drivers) or
+ (install_ddp_packages is defined and install_ddp_packages) or
+ (sriov_cni_enabled is defined and sriov_cni_enabled) or
+ (sriov_network_operator_enabled is defined and sriov_network_operator_enabled)
+
+ - role: bootstrap/auto_detect_qat_devices # noqa role-name[path]
+ tags:
+ - auto-detect-qat-device
+ - intel-platform-qat-setup
+ - setup-qat
+ - setup-sriov-qat
+ - qatlibs
+ when:
+ - configure_qat | default(false) | bool
+ - qat_devices | default([]) | length == 0
diff --git a/playbooks/dockerfiles.yml b/playbooks/dockerfiles.yml
index e3141cd4..b89e4a8f 100644
--- a/playbooks/dockerfiles.yml
+++ b/playbooks/dockerfiles.yml
@@ -19,20 +19,40 @@
vars:
dockerfiles_dir: "{{ (playbook_dir, '..', '.dockerfiles') | path_join }}"
tasks:
+ - name: Remove dockerfiles directory content
+ ansible.builtin.file:
+ path: "{{ dockerfiles_dir }}"
+ state: absent
+
- name: Ensure dockerfiles directory exists
ansible.builtin.file:
path: "{{ dockerfiles_dir }}"
state: directory
- mode: 0755
- - name: Template media analytics Dockerfile
+ mode: '0755'
+
+ - name: Generate media analytics Dockerfile
ansible.builtin.include_role:
name: intel_media_analytics
tasks_from: template_dockerfile
- - name: Generate media analytics Dockerfile
+
+ - name: Generate vss base Dockerfiles
+ vars:
+ base_container_path: "{{ (dockerfiles_dir, 'media_analytics') | path_join }}"
+ base_container_dockerfile_path: "{{ (dockerfiles_dir, 'media_analytics') | path_join }}"
+ base_container_test_path: "{{ (dockerfiles_dir, 'media_analytics') | path_join }}"
+ base_container_sudo: false
+ profile_name: "on_prem_vss"
+ prc_network: false
+ gpu_type: "Flex"
+ ansible.builtin.include_role:
+ name: intel_base_container
+ tasks_from: main
+
+ - name: Generate aibox Dockerfiles
vars:
- base_container_path: "{{ dockerfiles_dir }}"
- base_container_dockerfile_path: "{{ dockerfiles_dir }}"
- base_container_test_path: "{{ dockerfiles_dir }}"
+ base_container_path: "{{ (dockerfiles_dir, 'aibox') | path_join }}"
+ base_container_dockerfile_path: "{{ (dockerfiles_dir, 'aibox') | path_join }}"
+ base_container_test_path: "{{ (dockerfiles_dir, 'aibox') | path_join }}"
base_container_sudo: false
profile_name: "on_prem_aibox"
prc_network: false
diff --git a/playbooks/dyna_config.yml b/playbooks/dyna_config.yml
index 207d6af1..b36ee583 100644
--- a/playbooks/dyna_config.yml
+++ b/playbooks/dyna_config.yml
@@ -37,7 +37,7 @@
vars:
link_node_count: "{{ groups['kube_node'] | length }}"
assert:
- that: "{{ link_node_count }} == 2"
+ that: link_node_count | int == 2 # cast needed: the templated length renders as a string
msg: "Config dpdk link is a 2 nodes operation, but current nodes count is {{ link_node_count }}"
tags:
- dyna_config_dpdk
@@ -81,6 +81,19 @@
# Execute gpu driver role on worker nodes
- hosts: "{{node | default('kube_node')}}"
+ handlers:
+ - name: Update grub on RedHat systems
+ ansible.builtin.command: "grub2-mkconfig -o /boot/grub2/grub.cfg"
+ when: ansible_os_family == "RedHat"
+ changed_when: true
+ - name: Update grub on Ubuntu systems
+ ansible.builtin.command: "update-grub"
+ when: ansible_distribution == "Ubuntu"
+ changed_when: true
+ - name: reboot server
+ reboot: { reboot_timeout: 1200 }
+ when:
+ - inventory_hostname != "localhost"
roles:
- role: install_gpu_driver
tags:
diff --git a/playbooks/infra/prepare_ipu.yml b/playbooks/infra/prepare_ipu.yml
index 308e7ab2..83634483 100644
--- a/playbooks/infra/prepare_ipu.yml
+++ b/playbooks/infra/prepare_ipu.yml
@@ -18,10 +18,10 @@
roles:
- role: cluster_defaults
tags: always
- - role: bootstrap/configure_proxy
+ - role: bootstrap/configure_proxy # noqa role-name[path]
tags: proxy
- - role: ipu/common
- - role: ipu/flash_ipu_ssd
+ - role: ipu/common # noqa role-name[path]
+ - role: ipu/flash_ipu_ssd # noqa role-name[path]
when:
- not ipu_1gbe_connected_to_linkp
@@ -29,21 +29,21 @@
roles:
- role: cluster_defaults
tags: always
- - role: bootstrap/configure_proxy
+ - role: bootstrap/configure_proxy # noqa role-name[path]
tags: proxy
- - role: ipu/common
- - role: ipu/prepare_ipu_linkp
- - role: ipu/flash_ipu_ssd
+ - role: ipu/common # noqa role-name[path]
+ - role: ipu/prepare_ipu_linkp # noqa role-name[path]
+ - role: ipu/flash_ipu_ssd # noqa role-name[path]
when:
- ipu_1gbe_connected_to_linkp
- - role: ipu/flash_ipu_nvm
+ - role: ipu/flash_ipu_nvm # noqa role-name[path]
- hosts: ipu_imc
roles:
- - role: ipu/common
- - role: ipu/imc
+ - role: ipu/common # noqa role-name[path]
+ - role: ipu/imc # noqa role-name[path]
- hosts: ipu_acc
roles:
- - role: ipu/common
- - role: ipu/acc
+ - role: ipu/common # noqa role-name[path]
+ - role: ipu/acc # noqa role-name[path]
diff --git a/playbooks/infra/prepare_vms.yml b/playbooks/infra/prepare_vms.yml
index c479b6aa..ea55e7bb 100644
--- a/playbooks/infra/prepare_vms.yml
+++ b/playbooks/infra/prepare_vms.yml
@@ -14,22 +14,13 @@
## limitations under the License.
## --- -# add those bm host in mixed cluster to a group -- hosts: k8s_cluster - tasks: - - name: Add hosts to inventory - bm_host - ansible.builtin.add_host: - hostname: "{{ inventory_hostname }}" - groups: "bm_host" - inventory_dir: '{{ inventory_dir }}' - - hosts: vm_host roles: - - role: vm/compile_libvirt + - role: vm/compile_libvirt # noqa role-name[path] when: - - ansible_distribution == "Ubuntu" and ansible_distribution_version == "22.04" + - ansible_distribution == "Ubuntu" and ansible_distribution_version is version('22.04', '==') - sgx_dp_enabled | default(false) - - role: vm/conf_libvirt + - role: vm/conf_libvirt # noqa role-name[path] environment: "{{ proxy_env | d({}) }}" any_errors_fatal: true @@ -50,11 +41,14 @@ - hosts: vm_host gather_facts: false roles: - - role: vm/prepare_vm_cluster - - role: vm/manage_imgs - - role: vm/manage_bridges - - role: vm/manage_vms - - role: vm/prepare_cek + - role: vm/auto_configure_nic_devices # noqa role-name[path] + - role: vm/auto_configure_qat_devices # noqa role-name[path] + when: configure_qat | d(false) + - role: vm/prepare_vm_cluster # noqa role-name[path] + - role: vm/manage_imgs # noqa role-name[path] + - role: vm/manage_bridges # noqa role-name[path] + - role: vm/manage_vms # noqa role-name[path] + - role: vm/prepare_cek # noqa role-name[path] environment: "{{ proxy_env | d({}) }}" any_errors_fatal: true @@ -62,30 +56,30 @@ gather_facts: false serial: 1 roles: - - role: vm/prepare_bastion_host_config + - role: vm/prepare_bastion_host_config # noqa role-name[path] environment: "{{ proxy_env | d({}) }}" any_errors_fatal: true -- hosts: vm_host +- hosts: vm_host # noqa role-name[path] gather_facts: false roles: - - vm/prepare_cek_vxlan + - vm/prepare_cek_vxlan # noqa role-name[path] environment: "{{ proxy_env | d({}) }}" any_errors_fatal: true # need handle those baremetal hosts need connect to VMs - hosts: k8s_cluster vars: - - vms: - - name: dummy - vxlan: "{{ hostvars[groups['vm_host'][0]]['dhcp'][0] }}" - - dhcp: [] + vms: + - name: dummy + vxlan: "{{ hostvars[groups['vm_host'][0]]['dhcp'][0] }}" + dhcp: [] gather_facts: false serial: 1 roles: - - role: vm/install_bm_libvirt - - role: vm/manage_bridges - - role: vm/prepare_bm_host_config_vxlan + - role: vm/install_bm_libvirt # noqa role-name[path] + - role: vm/manage_bridges # noqa role-name[path] + - role: vm/prepare_bm_host_config_vxlan # noqa role-name[path] environment: "{{ proxy_env | d({}) }}" any_errors_fatal: true @@ -93,8 +87,8 @@ gather_facts: false serial: 1 roles: - - role: vm/prepare_bastion_host_config_vxlan - - role: vm/prepare_vm_inventory + - role: vm/prepare_bastion_host_config_vxlan # noqa role-name[path] + - role: vm/prepare_vm_inventory # noqa role-name[path] environment: "{{ proxy_env | d({}) }}" any_errors_fatal: true diff --git a/playbooks/infra/redeploy_cleanup.yml b/playbooks/infra/redeploy_cleanup.yml index 528ef892..099224f3 100644 --- a/playbooks/infra/redeploy_cleanup.yml +++ b/playbooks/infra/redeploy_cleanup.yml @@ -42,7 +42,9 @@ - name: Check Group Vars assert: - that: "group_vars_details.stat.exists and group_vars_details.stat.isdir" + that: + - group_vars_details.stat.exists + - group_vars_details.stat.isdir msg: "File group_vars/all.yml does NOT exist. Must be created per Guide" - name: read Host Vars @@ -53,7 +55,9 @@ - name: check Host Vars assert: - that: "item.stat.exists and item.stat.isreg" + that: + - item.stat.exists + - item.stat.isreg msg: "File host_vars/{{ item.item }}.yml does NOT exist. 
Must be created per Guide" with_items: "{{ host_vars_details.results }}" diff --git a/playbooks/intel/eci_basic.yml b/playbooks/intel/eci_basic.yml index c1edecfa..cf31fe6b 100644 --- a/playbooks/intel/eci_basic.yml +++ b/playbooks/intel/eci_basic.yml @@ -22,8 +22,7 @@ tags: intel-eci when: - intel_eci_enabled | default(false) | bool - environment: - - "{{ proxy_env | d({}) }}" + environment: "{{ proxy_env | d({}) }}" any_errors_fatal: true # This tasks is used to configure host and extract some information used for vm @@ -43,6 +42,5 @@ tags: intel-eci when: - intel_eci_enabled | default(false) | bool - environment: - - "{{ proxy_env | d({}) }}" + environment: "{{ proxy_env | d({}) }}" any_errors_fatal: true diff --git a/playbooks/intel/tdx.yml b/playbooks/intel/tdx.yml index 05e416e1..a3777939 100644 --- a/playbooks/intel/tdx.yml +++ b/playbooks/intel/tdx.yml @@ -25,20 +25,19 @@ roles: - role: cluster_defaults tags: always - - role: bootstrap/install_tdx_drivers + - role: bootstrap/install_tdx_drivers # noqa role-name[path] tags: tdx when: - configure_tdx | default(false) | bool - not on_vms | default(false) | bool - - role: bootstrap/set_tdx_kernel_flags + - role: bootstrap/set_tdx_kernel_flags # noqa role-name[path] tags: tdx when: - configure_tdx | default(false) | bool - not on_vms | default(false) | bool - - role: bootstrap/update_grub + - role: bootstrap/update_grub # noqa role-name[path] tags: - tdx - grub-update - intel-platform-qat-setup - environment: - - "{{ proxy_env | d({}) }}" + environment: "{{ proxy_env | d({}) }}" diff --git a/playbooks/k8s/k8s.yml b/playbooks/k8s/k8s.yml index 1cb822e4..b51a43df 100644 --- a/playbooks/k8s/k8s.yml +++ b/playbooks/k8s/k8s.yml @@ -68,7 +68,7 @@ set_fact: flannel_backend_type: >- {% if minio_enabled is defined and minio_enabled -%}vxlan{%- else -%} - {% if ansible_distribution_version >= '21.04' -%}host-gw{%- else -%}vxlan{% endif %}{% endif %} + {% if ansible_distribution_version is version('21.04', '>=') -%}host-gw{%- else -%}vxlan{% endif %}{% endif %} when: kube_network_plugin == "flannel" - name: Disable DNS stub listener when needed @@ -136,7 +136,6 @@ set_fact: container_manager: crio download_container: false - skip_downloads: false etcd_deployment_type: host when: container_runtime == "crio" @@ -144,13 +143,13 @@ - name: Deploy cluster via Kubespray ansible.builtin.import_playbook: "{% if scale | default(false) | bool %}kubernetes_sigs.kubespray.scale{% else %}kubernetes_sigs.kubespray.cluster{% endif %}" vars: - any_errors_fatal: true + kube_owner: root + kube_cert_group: root kubeadm_enabled: true helm_enabled: true krew_enabled: true multus_conf_file: /host/etc/cni/net.d/templates/00-multus.conf multus_image_tag: "v3.9.3-amd64" - nginx_image_tag: 1.24.0-alpine calico_node_livenessprobe_timeout: 15 calico_node_readinessprobe_timeout: 15 kube_proxy_mode: iptables @@ -176,7 +175,7 @@ kube_kubeadm_controller_extra_args: service-account-private-key-file: "{{ kube_cert_dir }}/sa.key" kubelet_cpu_manager_policy: "{% if native_cpu_manager_enabled | default(false) %}static{% else %}none{% endif %}" - kubelet_topoloy_manager_policy: "{{ topology_manager_policy | default('none') }}" + kubelet_topology_manager_policy: "{{ topology_manager_policy | default('none') }}" kubelet_topology_manager_scope: "{{ topology_manager_scope | default('container') }}" kubelet_config_extra_args: protectKernelDefaults: true @@ -236,6 +235,7 @@ register: flannel_endpoint - name: disable offloading features on flannel.1 command: ethtool --offload flannel.1 rx 
off tx off + changed_when: true become: yes when: - kube_network_plugin == "flannel" @@ -251,6 +251,12 @@ delay: 5 when: inventory_hostname == groups['kube_control_plane'][0] + - name: install calico vpp + ansible.builtin.include_role: + name: calico_vpp_install + tags: calico-vpp + when: calico_vpp.enabled | default(false) | bool + - name: allow traffic on wireguard interface block: - name: allow traffic on wireguard interface on Ubuntu @@ -361,9 +367,6 @@ when: - cert_manager_enabled | default(false) or registry_enable | default(false) - - role: calico_vpp_install - tags: calico-vpp - when: calico_vpp.enabled | default(false) | bool - role: container_registry tags: registry when: diff --git a/playbooks/k8s/kubelet-certificates.yml b/playbooks/k8s/kubelet-certificates.yml index 46f8655a..7396b601 100644 --- a/playbooks/k8s/kubelet-certificates.yml +++ b/playbooks/k8s/kubelet-certificates.yml @@ -61,12 +61,18 @@ changed_when: false when: inventory_hostname == groups['kube_control_plane'][0] - - name: create certs from CSRs - shell: - cmd: "{{ gopath.stdout }}/bin/cfssl gencert -ca {{ kube_cert_dir }}/ca.crt \ - -ca-key {{ kube_cert_dir }}/ca.key \ - -profile kubernetes node-{{ item }}.json | {{ gopath.stdout }}/bin/cfssljson -bare {{ item }}" + - name: Create certs from CSRs + ansible.builtin.shell: + cmd: >- + set -o pipefail && + {{ gopath.stdout }}/bin/cfssl gencert + -ca {{ kube_cert_dir }}/ca.crt + -ca-key {{ kube_cert_dir }}/ca.key + -profile kubernetes node-{{ item }}.json + | {{ gopath.stdout }}/bin/cfssljson -bare {{ item }} chdir: "{{ kube_csr_dir }}" + executable: /bin/bash + changed_when: true when: inventory_hostname == groups['kube_control_plane'][0] loop: "{{ groups['k8s_cluster'] }}" diff --git a/playbooks/k8s/post_deployment_hooks.yml b/playbooks/k8s/post_deployment_hooks.yml index 1841ba97..1b2fc998 100644 --- a/playbooks/k8s/post_deployment_hooks.yml +++ b/playbooks/k8s/post_deployment_hooks.yml @@ -20,73 +20,84 @@ - name: check if remote hooks dir present on ansible host delegate_to: localhost become: false - stat: + ansible.builtin.stat: path: "{{ hooks_remote }}" register: remote_hooks_dir - when: remote_hooks_dir.stat.present block: - name: delete hooks dir on remote - file: + ansible.builtin.file: state: absent path: "{{ hooks_remote }}" - name: copy remote hooks dir to remote become: true - copy: + ansible.builtin.copy: src: "{{ hooks_remote }}/" dest: "{{ hooks_remote }}" owner: root group: root mode: 0644 - name: read .sh files in remote hooks dir - find: + ansible.builtin.find: paths: "{{ hooks_remote }}" patterns: "*.sh" register: sh_remote_files_found - - debug: msg="{{ sh_remote_files_found.files }}" + - ansible.builtin.debug: + msg: "{{ sh_remote_files_found.files }}" - name: execute .sh scripts from hooks dir - command: "sh {{ item.path }}" + ansible.builtin.command: "sh {{ item.path }}" with_items: "{{ sh_remote_files_found.files }}" + changed_when: true when: sh_remote_files_found.files | length > 0 - name: read .py files in remote hooks dir - find: + ansible.builtin.find: paths: "{{ hooks_remote }}" patterns: "*.py" register: py_remote_files_found - - debug: msg="{{ py_remote_files_found.files }}" + - ansible.builtin.debug: + msg: "{{ py_remote_files_found.files }}" - name: execute .py scripts from hooks dir - command: "python3 {{ item.path }}" + ansible.builtin.command: "python3 {{ item.path }}" + changed_when: true with_items: "{{ py_remote_files_found.files }}" when: py_remote_files_found.files | length > 0 - name: execute .sh, .py scripts and ansible 
playbooks on localhost hosts: localhost tasks: - name: read .sh files in local hooks dir - find: + ansible.builtin.find: paths: "{{ hooks_local }}" patterns: "*.sh" register: sh_local_files_found - - debug: msg="{{ sh_local_files_found.files }}" + - ansible.builtin.debug: + msg: "{{ sh_local_files_found.files }}" - name: execute .sh scripts from local hooks dir - command: "sh {{ item.path }}" + ansible.builtin.command: "sh {{ item.path }}" + changed_when: true with_items: "{{ sh_local_files_found.files }}" when: sh_local_files_found.files | length > 0 - name: read .py files in local hooks dir - find: + ansible.builtin.find: paths: "{{ hooks_local }}" patterns: "*.py" register: py_local_files_found - - debug: msg="{{ py_local_files_found.files }}" + - ansible.builtin.debug: + msg: "{{ py_local_files_found.files }}" - name: execute .py scripts from hooks dir - command: "python3 {{ item.path }}" + ansible.builtin.command: "python3 {{ item.path }}" + changed_when: true with_items: "{{ py_local_files_found.files }}" when: py_local_files_found.files | length > 0 - name: read .yml and .yaml files in local hooks dir - find: + ansible.builtin.find: paths: "{{ hooks_local }}" patterns: "*.yaml,*.yml" register: playbooks_local_files_found - - debug: msg="{{ playbooks_local_files_found.files }}" + - ansible.builtin.debug: + msg: "{{ playbooks_local_files_found.files }}" - name: execute ansible playbooks from hooks dir - command: ansible-playbook -i ../../inventory.ini "{{ item.path }}" + ansible.builtin.command: + cmd: ansible-playbook -i ../../inventory.ini "{{ item.path }}" + changed_when: true with_items: "{{ playbooks_local_files_found.files }}" when: playbooks_local_files_found.files | length > 0 diff --git a/playbooks/k8s/templates/rke2_config.yaml.j2 b/playbooks/k8s/templates/rke2_config.yaml.j2 index da2dc3e6..21ad38b2 100644 --- a/playbooks/k8s/templates/rke2_config.yaml.j2 +++ b/playbooks/k8s/templates/rke2_config.yaml.j2 @@ -1,6 +1,7 @@ --- disable: rke2-ingress-nginx profile: cis-1.23 +disable-cloud-controller: true protect-kernel-defaults: true audit-policy-file: {{ rke2_audit_policy_file }} diff --git a/playbooks/preflight.yml b/playbooks/preflight.yml index 28010704..1b293906 100644 --- a/playbooks/preflight.yml +++ b/playbooks/preflight.yml @@ -121,7 +121,9 @@ - name: Check Group Vars ansible.builtin.assert: - that: "group_vars_details.stat.exists and group_vars_details.stat.isdir" + that: + - group_vars_details.stat.exists + - group_vars_details.stat.isdir msg: "File group_vars/all.yml does NOT exist. Must be created per Guide" - name: read Host Vars @@ -132,7 +134,9 @@ - name: check Host Vars ansible.builtin.assert: - that: "item.stat.exists and item.stat.isreg" + that: + - item.stat.exists + - item.stat.isreg msg: "File host_vars/{{ item.item }}.yml does NOT exist. Must be created per Guide" with_items: "{{ host_vars_details.results }}" @@ -144,7 +148,9 @@ - name: check VM Host Vars ansible.builtin.assert: - that: "item.stat.exists and item.stat.isreg" + that: + - item.stat.exists + - item.stat.isreg msg: "File host_vars/{{ item.item }}.yml does NOT exist. Must be created per Guide" with_items: "{{ vm_host_vars_details.results }}" @@ -165,6 +171,12 @@ register: check_ip_status when: vm_enabled and (not on_vms | default(false)) + - name: check container_runtime value + ansible.builtin.assert: + that: container_runtime is in ['docker', 'containerd', 'crio'] + msg: Incorrect container_runtime value !! 
+ Make sure 'container_runtime' is in ['docker', 'containerd', 'crio'] + - name: handle the error for check passwordless access to VM hosts block: - name: check passwordless access to VM hosts @@ -228,14 +240,14 @@ - name: check scale variable value ansible.builtin.assert: - that: "{{ scale | bool }}" + that: scale | bool fail_msg: "scale variable must be set to one of the following values { yes, on, 1, true }, case insensitive" success_msg: "scale variable is set to {{ scale }} \ncluster scaling is enabled" when: scale is defined - name: check vm_recreate_existing variable value ansible.builtin.assert: - that: "not {{ vm_recreate_existing | bool }}" + that: not vm_recreate_existing | bool fail_msg: "vm_recreate_existing has to be false for cluster scaling case" success_msg: "vm_recreate_existing variable is set to {{ vm_recreate_existing }} for cluster scaling" when: @@ -253,8 +265,8 @@ kube_version_number: "{{ rke2_version | regex_search('(?<=v)\\d+\\.\\d+') }}" - name: assert kube_version_number ansible.builtin.assert: - that: "{{ kube_version_number is version('1.25', '<=') }}" - fail_msg: "Maximum k8s version for rancher manager is v1.25, current version is v{{ kube_version_number }}, please update group_vars" + that: kube_version_number is version('1.28', '<=') + fail_msg: "Maximum k8s version for rancher manager is v1.28, current version is v{{ kube_version_number }}, please update group_vars" when: rancher_manager_enabled is defined and rancher_manager_enabled ############################################## @@ -279,6 +291,10 @@ - "'vm_host' in group_names" - on_vms is defined and on_vms + - name: show mapping of inventory_hostname to real hostname + ansible.builtin.debug: + msg: "inventory_hostname = {{ inventory_hostname }} -> real hostname = {{ hostvars[inventory_hostname]['ansible_hostname'] }}" + - name: fail if deployment is VMRA and isolcpus is enabled ansible.builtin.assert: that: @@ -304,7 +320,9 @@ - name: check Host Vars for VMs ansible.builtin.assert: - that: "item.stat.exists and item.stat.isreg" + that: + - item.stat.exists + - item.stat.isreg msg: "File host_vars/{{ item.item.name }}.yml does NOT exist. 
Must be created per Guide" with_items: "{{ vm_host_vars_details.results }}" when: @@ -340,6 +358,7 @@ when: - vm_enabled and (not on_vms | default(false)) - inventory_hostname in groups['k8s_cluster'] + - hostvars[inventory_hostname]['ansible_virtualization_role']|default('') != 'guest' block: - name: check if mandatory parameter 'vxlan_physical_network' is present for the baremetal host assert: @@ -352,7 +371,7 @@ - name: check if mandatory parameter 'vxlan_physical_network' for the baremetal host has valid IP subnet assert: that: - - "vxlan_physical_network | ansible.utils.ipaddr('net')" + - vxlan_physical_network | ansible.utils.ipaddr('net') msg: | "vxlan_physical_network parameter for the host '{{ inventory_hostname }}' does not have valid IP subnet" "Current value is '{{ vxlan_physical_network }}'" @@ -360,7 +379,7 @@ - name: check if VXLAN physical network is available assert: - that: "hostvars[inventory_hostname]['ansible_all_ipv4_addresses'] | ansible.utils.ipaddr(vxlan_physical_network) | length > 0 " + that: hostvars[inventory_hostname]['ansible_all_ipv4_addresses'] | ansible.utils.ipaddr(vxlan_physical_network) | length > 0 msg: | "vxlan_physical_network '{{ vxlan_physical_network }}' is not available on host '{{ inventory_hostname }}'" "Please correct the configuration or update network setup to ensure proper connectivity with vm_hosts" @@ -376,7 +395,7 @@ - name: check if mandatory parameter 'vxlan_gw_ip' for the baremetal host has valid IP assert: that: - - "vxlan_gw_ip | ansible.utils.ipaddr('address')" + - vxlan_gw_ip | ansible.utils.ipaddr('address') msg: | "vxlan_gw_ip parameter for the host '{{ inventory_hostname }}' does not have valid IP" "Current value is '{{ vxlan_gw_ip }}'" @@ -386,12 +405,13 @@ - name: Check requirements to enable VMs multinode setup when: - vm_enabled and (not on_vms | default(false)) - - "(groups['vm_host'] | length > 1) or (groups['k8s_cluster'] | length > 0)" + - "(groups['vm_host'] | length > 1) or \ + ('k8s_cluster' in group_names and hostvars[inventory_hostname]['ansible_virtualization_role']|default('') != 'guest')" block: - name: check if mandatory parameter 'dhcp' is present on vm_host[0] assert: that: - - "hostvars[groups['vm_host'][0]]['dhcp'] | default([]) | length" + - hostvars[groups['vm_host'][0]]['dhcp'] | default([]) | length msg: | "vxlan dhcp must be defined on {{ groups['vm_host'][0] }} for multiple hosts deployment" run_once: true @@ -399,7 +419,7 @@ - name: check if mandatory parameter 'vxlan_physical_network' is present for the first vm_host ansible.builtin.assert: that: - - "hostvars[groups['vm_host'][0]]['vxlan_physical_network'] is defined" + - hostvars[groups['vm_host'][0]]['vxlan_physical_network'] is defined msg: | "vxlan_physical_network parameter is not defined for the first vm_host '{{ groups['vm_host'][0] }}'" "Please correct the configuration" @@ -408,7 +428,7 @@ - name: check if mandatory parameter 'vxlan_physical_network' for the first vm_host has valid IP subnet ansible.builtin.assert: that: - - "hostvars[groups['vm_host'][0]]['vxlan_physical_network'] | ansible.utils.ipaddr('net')" + - hostvars[groups['vm_host'][0]]['vxlan_physical_network'] | ansible.utils.ipaddr('net') msg: | "vxlan_physical_network parameter for the first vm_host '{{ groups['vm_host'][0] }}' does not have valid IP subnet" "Current value is '{{ hostvars[groups['vm_host'][0]]['vxlan_physical_network'] }}'" @@ -417,15 +437,17 @@ - name: check if VXLAN physical network is available ansible.builtin.assert: - that: 
"hostvars[inventory_hostname]['ansible_all_ipv4_addresses'] | ansible.utils.ipaddr(hostvars[groups['vm_host'][0]]['vxlan_physical_network']) | - length > 0 " + that: + hostvars[inventory_hostname]['ansible_all_ipv4_addresses'] | + ansible.utils.ipaddr(hostvars[groups['vm_host'][0]]['vxlan_physical_network']) | + length > 0 msg: | "vxlan_physical_network '{{ hostvars[groups['vm_host'][0]]['vxlan_physical_network'] }}' is not available on vm_host '{{ inventory_hostname }}'" "Please correct the configuration or update network setup to ensure proper connectivity between vm_hosts" - name: check for ip and ansible_host mismatch ansible.builtin.assert: - that: "ansible_host == ip" + that: ansible_host == ip msg: "Configuration mismatch detected between ansible_host={{ ansible_host }} and ip={{ ip }} on target '{{ inventory_hostname }}'" when: - ansible_host is defined @@ -438,7 +460,9 @@ msg: "Linux distribution on target is {{ ansible_distribution }} {{ ansible_distribution_version }} {{ ansible_distribution_release }}" - name: Check Linux Distro and Version ansible.builtin.assert: - that: "ansible_distribution in cek_supported_distros and ansible_distribution_version in cek_supported_distros_versions" + that: + - ansible_distribution in cek_supported_distros + - ansible_distribution_version in cek_supported_distros_versions msg: - Linux distribution {{ ansible_distribution }} {{ ansible_distribution_version }} on target '{{ inventory_hostname }}' is NOT supported - Must be one of {{ cek_supported_distros }} and version {{ cek_supported_distros_versions }} @@ -506,8 +530,8 @@ - name: check k8s version ansible.builtin.assert: - that: "{{ kube_version is version('v1.22', '>=') }}" - msg: "Minimum supported k8s version is 1.22, please update kube_version variable with correct version" + that: kube_version is version('v1.26', '>=') + msg: "Minimum supported k8s version is 1.26, please update kube_version variable with correct version" when: kubernetes and not container_runtime_only_deployment - name: check RKE2 requirements @@ -519,7 +543,7 @@ - name: assert that Multus is enabled in the config ansible.builtin.assert: that: - - "kube_network_plugin_multus" + - kube_network_plugin_multus | default(false) fail_msg: "SRIOV and the Userspace CNI plugin require Multus for a fully functional cluster deployment" when: sriov_net_dp_enabled is defined and sriov_net_dp_enabled or sriov_cni_enabled is defined and sriov_cni_enabled or @@ -528,7 +552,7 @@ - name: assert that SRIOV Network Operator/SRIOV Network DP are mutually exclusive ansible.builtin.assert: that: - - "not sriov_net_dp_enabled" + - not sriov_net_dp_enabled | default(false) fail_msg: - "SRIOV Network Operator is enabled, SRIOV Network DP/SRIOV CNI should be disabled !!" when: sriov_network_operator_enabled is defined and sriov_network_operator_enabled @@ -549,14 +573,15 @@ - name: check OS for cpusets based on cgroups v1 only ansible.builtin.assert: that: - - (ansible_distribution == 'Ubuntu' and ansible_distribution_version == '20.04') + - ansible_distribution == "Ubuntu" + - ansible_distribution_version is version('20.04', '==') msg: - Unsupported configuration. - CPUs isolation ('cpusets') can be only enabled on Ubuntu 20.04.x. - name: check CPUs isolation ansible.builtin.assert: - that: ( "{{ cpusets }}" | length > 0 ) + that: cpusets | length > 0 msg: - Incorrect configuration. Conflicting or improper values detected - CPUs isolation ('cpusets') must be set according to the example file for host_vars. 
Please correct the configuration @@ -592,14 +617,14 @@ - name: check cpusets Total ansible.builtin.assert: - that: "{{ cpusets_list | length }} <= ansible_processor_vcpus" + that: cpusets_list | length <= ansible_processor_vcpus msg: - Incorrect configuration pertaining cpusets. Conflicting or improper values detected - The number of cpusets {{ cpusets_list | length }}, exceeds total CPUs on target {{ ansible_processor_vcpus }}. Please correct the configuration - name: check cpusets IDs ansible.builtin.assert: - that: "item | int <= ansible_processor_vcpus" + that: item | int <= ansible_processor_vcpus msg: - Incorrect configuration pertaining cpusets. Conflicting or improper values detected - The CPU ID {{ item }} set for cpusets is NOT actually present on target. Please correct the configuration @@ -607,7 +632,7 @@ - name: check cpusets OS Reserved ansible.builtin.assert: - that: "item not in cpusets_list" + that: item not in cpusets_list msg: - Incorrect configuration pertaining cpusets. Conflicting or improper values detected - The CPU ID 0...{{ ansible_processor_count - 1 }} should NOT be set for cpusets. Please correct the configuration @@ -627,11 +652,10 @@ - ansible_processor_threads_per_core = {{ ansible_processor_threads_per_core }} - ansible_processor_vcpus = {{ ansible_processor_vcpus }} - CPUs Reserved for OS = 0...{{ ansible_processor_count - 1 }} - # - CPUs Reserved for OS = {{ lookup('sequence','0-{{ ansible_processor_count - 1 }}').split(',') }} # [E207] Nested jinja pattern - name: check CPUs isolation ansible.builtin.assert: - that: ( "{{ isolcpus }}" | length > 0 ) + that: isolcpus | length > 0 msg: - Incorrect configuration. Conflicting or improper values detected - CPUs isolation ('isolcpus') must be set according to the example file for host_vars. Please correct the configuration @@ -667,14 +691,14 @@ - name: check isolcpus Total ansible.builtin.assert: - that: "{{ isolcpus_list | length }} <= ansible_processor_vcpus" + that: isolcpus_list | length <= ansible_processor_vcpus msg: | Incorrect configuration pertaining isolcpus. Conflicting or improper values detected The number of isolcpus {{ isolcpus_list | length }}, exceeds total CPUs on target {{ ansible_processor_vcpus }}. Please correct the configuration - name: check isolcpus IDs ansible.builtin.assert: - that: "item | int <= ansible_processor_vcpus" + that: item | int <= ansible_processor_vcpus msg: - Incorrect configuration pertaining isolcpus. Conflicting or improper values detected - The CPU ID {{ item }} set for isolcpus is NOT actually present on target. Please correct the configuration @@ -682,7 +706,7 @@ - name: check isolcpus OS Reserved ansible.builtin.assert: - that: "item not in isolcpus_list" + that: item not in isolcpus_list msg: - Incorrect configuration pertaining isolcpus. Conflicting or improper values detected - The CPU ID 0...{{ ansible_processor_count - 1 }} should NOT be set for isolcpus. 
Please correct the configuration
@@ -712,6 +736,23 @@
tags:
- intel-media-analytics
+ - name: Check KubeVirt configuration
+ ansible.builtin.import_role:
+ name: kubevirt_install
+ tasks_from: preflight.yml
+ when:
+ - kubernetes | default(false)
+ - kubevirt_enabled | default(false)
+ tags: kubevirt
+
+ - name: Check Infrastructure Power Manager configuration
+ ansible.builtin.import_role:
+ name: infrastructure_power_manager
+ tasks_from: preflight.yml
+ when:
+ - infrastructure_power_manager_enabled | default(false)
+ tags: infra-power-manager
+
####################################
# Prerequisites for Worker Node(s) #
####################################
@@ -736,7 +777,8 @@
- name: assert that (SRIOV Network DP/SRIOV CNI) and (SRIOV Network Operator) are mutually exclusive
ansible.builtin.assert:
- that: "not (sriov_net_dp_enabled and (sriov_network_operator_enabled | default(false)))"
+ that:
+ - not (sriov_net_dp_enabled | default(false) and sriov_network_operator_enabled | default(false))
fail_msg:
- "SRIOV Network DP/SRIOV CNI and SRIOV Network Operator are mutually exclusive. One must be disabled"
when:
@@ -745,7 +787,7 @@
- name: check DP Interfaces
ansible.builtin.assert:
- that: "dataplane_interfaces != []"
+ that: dataplane_interfaces != []
msg: "Dataplane (DP) interface(s) on target '{{ ansible_hostname }}' must be set in host_vars. Please correct the configuration"
when:
- profile_name != 'build_your_own'
@@ -770,7 +812,7 @@
- name: read Physical NICs PCIIDs
ansible.builtin.set_fact:
- phy_nics_pciids: "{{ phy_nics_pciids + [ ansible_facts[item]['pciid'] ] }}"
+ phy_nics_pciids: "{{ phy_nics_pciids + [ ansible_facts[item]['pciid'] | regex_replace('0000:(.*)', '\\1') ] }}"
with_items: "{{ ansible_interfaces }}"
when: ansible_facts[item]['pciid'] is defined and ansible_facts[item]['type'] == "ether"
@@ -778,7 +820,7 @@
- name: check DP Interfaces Bus Info
ansible.builtin.assert:
- that: ("{{ item.bus_info }}" in "{{ phy_nics_pciids }}")
+ that: item.bus_info in phy_nics_pciids
msg: "Dataplane interface defined with PCI ID '{{ item.bus_info }}' does NOT exist on target. Please correct the configuration"
with_items: "{{ dataplane_interfaces }}"
when: dataplane_interfaces is defined and dataplane_interfaces != []
@@ -811,7 +853,7 @@
block:
- name: Check first DP interface driver
ansible.builtin.assert:
- that: "dataplane_interfaces[0].pf_driver == ansible_facts[item]['module']"
+ that: dataplane_interfaces[0].pf_driver == ansible_facts[item]['module']
msg: >-
"Dataplane interface '{{ dataplane_interfaces[0].bus_info }}' 'pf_driver' is set to '{{ dataplane_interfaces[0].pf_driver }}'.
Please, set it to '{{ ansible_facts[item]['module'] }}'."
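A note on the recurring assert rewrites above: the 'that' entries of ansible.builtin.assert are evaluated as raw Jinja expressions, so the old "{{ ... }}"-wrapped forms were templated twice, which is what ansible-lint flags here. A minimal sketch of the corrected shape (the task name and message are illustrative, not taken from the repo):

- name: example preflight assert
  ansible.builtin.assert:
    that:
      - dataplane_interfaces != []
      - item.bus_info in phy_nics_pciids
    msg: "Bare expressions in 'that' are templated once, at evaluation time"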
@@ -822,7 +864,7 @@ - name: check if selected DDP package corresponds PF driver ice ansible.builtin.assert: - that: "{{ dataplane_interfaces[0].ddp_profile is regex('^ice_comms*') }}" + that: dataplane_interfaces[0].ddp_profile is regex('^ice_comms*') msg: "ddp_profile '{{ dataplane_interfaces[0].ddp_profile }}' doesn't correspond pf_driver '{{ dataplane_interfaces[0].pf_driver }}'" when: - dataplane_interfaces[0].ddp_profile is defined @@ -832,7 +874,7 @@ - name: check if selected DDP package corresponds PF driver i40e ansible.builtin.assert: - that: "{{ dataplane_interfaces[0].ddp_profile in ddp_profiles_allowed }}" + that: dataplane_interfaces[0].ddp_profile in ddp_profiles_allowed msg: "ddp_profile '{{ dataplane_interfaces[0].ddp_profile }}' doesn't correspond pf_driver '{{ dataplane_interfaces[0].pf_driver }}'" when: - dataplane_interfaces[0].ddp_profile is defined @@ -849,6 +891,19 @@ when: - dataplane_interfaces is defined and dataplane_interfaces | length > 0 + - name: check total SRIOV VFs required for NIC device auto detection on VMs + ansible.builtin.include_role: + name: bootstrap/auto_detect_nic_devices # noqa role-name[path] + tasks_from: preflight + when: + - vms is defined + - dataplane_interfaces is defined + - dataplane_interfaces | default([]) | length == 0 + - (update_nic_drivers is defined and update_nic_drivers) or + (install_ddp_packages is defined and install_ddp_packages) or + (sriov_cni_enabled is defined and sriov_cni_enabled) or + (sriov_network_operator_enabled is defined and sriov_network_operator_enabled) + - name: Print processor info ansible.builtin.debug: msg: "ansible_processor model: {{ ansible_processor[2] }}" @@ -859,6 +914,13 @@ cpu_id: "{{ ansible_processor[2] | regex_search('\\$?\\d\\d\\d\\d\\%?\\@?\\w?|\\d\\d/\\d\\w') }}" when: (not vm_enabled) or (vm_enabled and (not on_vms | default(false))) + - name: add dummy CPU ID for GNR in preflight + ansible.builtin.set_fact: + cpu_id: "9999" + when: + - (not vm_enabled) or (vm_enabled and (not on_vms | default(false))) + - cpu_id | length == 0 + - name: print CPU ID ansible.builtin.debug: msg: "CPU ID: {{ cpu_id }}" @@ -866,9 +928,9 @@ - name: check if CPU has confirmed support (preflight) ansible.builtin.assert: - that: "cpu_id in {{ lookup('ansible.builtin.vars', 'confirmed_' + configured_arch + '_cpus') }} \ - {% if configured_arch == 'clx' %} or cpu_id in {{ confirmed_clx_ncpus }} {% endif %} \ - or cpu_id in {{ unconfirmed_cpu_models }}" + that: cpu_id in lookup('ansible.builtin.vars', 'confirmed_' + configured_arch + '_cpus') + {% if configured_arch == 'clx' %} or cpu_id in confirmed_clx_ncpus {% endif %} + or cpu_id in unconfirmed_cpu_models fail_msg: "CPU model '{{ cpu_id }}' present on target is not in the confirmed CPUs list.\n To proceed, please add '{{ cpu_id }}' to the list of unconfirmed CPUs in variable 'unconfirmed_cpu_models' in group_vars.\n @@ -876,6 +938,7 @@ when: - (not vm_enabled) or (vm_enabled and (not on_vms | default(false))) - configured_arch not in ['atom', 'core', 'ultra'] + - hostvars[inventory_hostname]['ansible_virtualization_role']|default('') != 'guest' - name: check ubuntu pro token is not a placeholder ansible.builtin.assert: @@ -889,36 +952,71 @@ when: - rt_kernel_enabled | default(false) - - name: check EMR QAT drvier package + - name: check NDA QAT drvier package block: - - name: print debug message EMR QAT driver package + - name: print debug message NDA QAT driver package ansible.builtin.debug: - msg="Expecting file {{ (emr_qat_driver_staging_folder, 
@@ -889,36 +952,71 @@
       when:
         - rt_kernel_enabled | default(false)
 
-    - name: check EMR QAT drvier package
+    - name: check NDA QAT driver package
       block:
-        - name: print debug message EMR QAT driver package
+        - name: print debug message NDA QAT driver package
           ansible.builtin.debug:
-            msg="Expecting file {{ (emr_qat_driver_staging_folder, emr_qat_driver_package) | path_join }} on local ansible host"
-        - name: probe for EMR QAT driver package
+            msg="Expecting file {{ (nda_qat_driver_folder, nda_qat_driver_package) | path_join }} for {{ configured_arch }} on local ansible host"
+        - name: probe for NDA QAT driver package
           delegate_to: localhost
           become: false
           ansible.builtin.stat:
-            path: "{{ (emr_qat_driver_staging_folder, emr_qat_driver_package) | path_join }}"
+            path: "{{ (nda_qat_driver_folder, nda_qat_driver_package) | path_join }}"
            checksum_algorithm: sha1
-          register: emr_qat_driver
-        - name: print debug message for emr qat driver existence
+          register: nda_qat_driver
+        - name: print debug message for NDA QAT driver existence
           ansible.builtin.debug:
-            msg="{{ emr_qat_driver_package }} exists is {{ emr_qat_driver.stat.exists }}"
-        - name: check emr qat driver files exists
+            msg="{{ nda_qat_driver_package }} exists is {{ nda_qat_driver.stat.exists }}"
+        - name: check NDA QAT driver files exists
           ansible.builtin.assert:
-            that: "emr_qat_driver.stat.exists"
-            msg:
-              - Mandatory file {{ (emr_qat_driver_staging_folder, emr_qat_driver_package) | path_join }} does NOT exist on localhost.
-              - Please acquire the zip file and place it in the location indicated above in order to deploy EMR QAT. See docs/emr.md
+            that: nda_qat_driver.stat.exists
+            msg: |
+              Mandatory file {{ (nda_qat_driver_folder, nda_qat_driver_package) | path_join }} does NOT exist on localhost.
+              Please acquire the zip file and place it in the location indicated above in order to deploy NDA QAT. See docs/{{ configured_arch }}.md
         - name: check the qat driver package integrity
           ansible.builtin.assert:
-            that: "emr_qat_driver.stat.checksum == '{{ emr_qat_driver_pkg_checksum }}'"
-            msg:
-              - File {{ (emr_qat_driver_staging_folder, emr_qat_driver_package) | path_join }} on localhost is NOT the expected one.
-              - Please provide the correct file. See docs/emr.md
+            that: nda_qat_driver.stat.checksum == nda_qat_driver_pkg_checksum
+            msg: |
+              File {{ (nda_qat_driver_folder, nda_qat_driver_package) | path_join }} on localhost is NOT the expected one.
+              Real checksum -> {{ nda_qat_driver.stat.checksum }}, expected checksum -> {{ nda_qat_driver_pkg_checksum }}
+              Please provide the correct file. See docs/{{ configured_arch }}.md
       when:
         - update_qat_drivers | default(false)
-        - configured_arch == "emr"
+        - configured_arch in ['emr', 'gnr']
+        - configured_arch == 'emr' or (configured_arch == 'gnr' and (ansible_distribution != "Rocky" or (ansible_distribution == "Rocky" and ansible_distribution_version != "9.2")))
+
+    - name: check NDA QAT driver package for Rocky 9.2
+      block:
+        - name: print debug message NDA QAT driver package for Rocky
+          ansible.builtin.debug:
+            msg="Expecting file {{ (nda_qat_driver_folder, nda_qat_driver_package_rocky) | path_join }} for {{ configured_arch }} on local ansible host"
+        - name: probe for NDA QAT driver package for Rocky
+          delegate_to: localhost
+          become: false
+          ansible.builtin.stat:
+            path: "{{ (nda_qat_driver_folder, nda_qat_driver_package_rocky) | path_join }}"
+            checksum_algorithm: sha1
+          register: nda_qat_driver
+        - name: print debug message for NDA QAT driver existence for Rocky
+          ansible.builtin.debug:
+            msg="{{ nda_qat_driver_package_rocky }} exists is {{ nda_qat_driver.stat.exists }}"
+        - name: check NDA QAT driver files exists for Rocky
+          ansible.builtin.assert:
+            that: nda_qat_driver.stat.exists
+            msg: |
+              Mandatory file {{ (nda_qat_driver_folder, nda_qat_driver_package_rocky) | path_join }} does NOT exist on localhost.
+              Please acquire the zip file and place it in the location indicated above in order to deploy NDA QAT. See docs/{{ configured_arch }}.md
+        - name: check the qat driver package integrity for Rocky
+          ansible.builtin.assert:
+            that: nda_qat_driver.stat.checksum == nda_qat_driver_pkg_checksum_rocky
+            msg: |
+              File {{ (nda_qat_driver_folder, nda_qat_driver_package_rocky) | path_join }} on localhost is NOT the expected one.
+              Real checksum -> {{ nda_qat_driver.stat.checksum }}, expected checksum -> {{ nda_qat_driver_pkg_checksum_rocky }}
+              Please provide the correct file. See docs/{{ configured_arch }}.md
+      when:
+        - update_qat_drivers | default(false)
+        - configured_arch in ['gnr']
+        - ansible_distribution == "Rocky" and ansible_distribution_version is version('9.2', '==')
 
     - name: check QAT SVM precheck
       ansible.builtin.assert:
@@ -945,7 +1043,7 @@
 
     - name: assert QAT PCIIDs
       ansible.builtin.assert:
-        that: "lspci_qat_host.rc == 0"
+        that: lspci_qat_host.rc == 0
         fail_msg: "No QAT devices were found in system. Please configure properly the QAT PCIIDs in group_vars or disable this feature"
         success_msg: "QAT PCIIDs verification completed"
       when:
@@ -969,7 +1067,7 @@
 
     - name: check QAT Devices' Bus Info
       ansible.builtin.assert:
-        that: ("{{ item.qat_id }}" in """{{ lspci_qat.stdout }}""")
+        that: item.qat_id in lspci_qat.stdout
         msg: "QAT device defined with PCI ID '{{ item.qat_id }}' does NOT exist on target. Please correct the configuration"
       with_items: "{{ qat_devices }}"
 # STORY: "qat_sriov_numvfs should not exceed max supported (16) per each dev_ID"
@@ -981,7 +1079,7 @@
 
     - name: check QAT SRIOV VFs
       ansible.builtin.assert:
-        that: ({{ item.qat_sriov_numvfs }} <= 16)
+        that: item.qat_sriov_numvfs <= 16
         msg:
           - Incorrect configuration pertaining QAT SRIOV. Conflicting or improper values detected
           - When SRIOV VFs are set for QAT, max value is 16 for each ID (max 48 total per card). Please correct the configuration
       when:
         - update_qat_drivers | default(false)
         - qat_devices is defined and qat_devices != []
@@ -990,13 +1088,12 @@
 
-    - name: check QAT SRIOV VFs requirement for qat device auto detection
-      ansible.builtin.assert:
-        that: ({{ qat_sriov_numvfs_required }} <= 16)
-        msg:
-          - Incorrect configuration in qat_sriov_numvfs_required for requested number of QAT VTs
-          - When SRIOV VFs are set for QAT, max value is 16 for each ID (max 48 total per card). Please correct the configuration
+    - name: check total SRIOV VFs required for QAT device auto detection on VMs
+      ansible.builtin.include_role:
+        name: bootstrap/auto_detect_qat_devices  # noqa role-name[path]
+        tasks_from: preflight
       when:
+        - vms is defined
         - update_qat_drivers | default(false)
         - qat_devices is defined and qat_devices == []
@@ -1008,6 +1105,13 @@
       - gpu_dp_enabled is defined and gpu_dp_enabled
       - profile_name != "on_prem_aibox"
 
+    - name: Check Intel In-Band Manageability configuration
+      ansible.builtin.include_role:
+        name: intel_inband_manageability
+        tasks_from: preflight
+      when:
+        - intel_inband_manageability_enabled | d(false)
+
     - name: Check GPU device plugin state
       ansible.builtin.assert:
         that:
@@ -1022,7 +1126,9 @@
       block:
         - name: FPGA OS precheck
           ansible.builtin.assert:
-            that: ((ansible_distribution == "Ubuntu") and (ansible_distribution_version == '22.04'))
+            that:
+              - ansible_distribution == "Ubuntu"
+              - ansible_distribution_version is version('22.04', '==')
            msg: >-
              Currently fpga is only supported on Ubuntu 22.04.
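
Several hunks in this file, including the FPGA precheck just above, replace raw string comparisons of `ansible_distribution_version` with Jinja's `version()` test. The difference matters as soon as a version component reaches two digits; a standalone Python illustration of the two behaviors:

```python
# Why 'is version(...)' beats plain string comparison for version numbers.
def as_tuple(v: str) -> tuple:
    # Component-wise numeric comparison, roughly what the version() test does.
    return tuple(int(part) for part in v.split("."))

print("8.10" >= "8.3")                      # False -- lexicographic, wrong
print(as_tuple("8.10") >= as_tuple("8.3"))  # True  -- component-wise, correct
```

With string comparison, a hypothetical RHEL 8.10 host would fail an `>= '8.3'` check; the `version()` test handles it correctly.
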
@@ -1043,7 +1149,7 @@ - name: check fpga install scripts exists ansible.builtin.assert: - that: "fpga_register.stat.exists" + that: fpga_register.stat.exists msg: - "Mandatory file {{ (fpga_driver_staging_folder, fpga_install_script) | path_join }} does NOT exist on localhost." - "Please acquire the file from Intel Resource and Design Center and place it in the location indicated above in order to deploy fpga." @@ -1063,7 +1169,8 @@ ansible.builtin.assert: that: - gpu_dp_enabled - fail_msg: "gas installation requires gpu_dp_enabled set to true" + - gpu_dp_fractional_manager + fail_msg: "gas installation requires gpu_dp_enabled and gpu_dp_fractional_manager set to true" success_msg: "gas requirement verified" when: - gas_enabled | default(false) @@ -1128,11 +1235,11 @@ gpu_dp_enabled | default(false) or dsa_dp_enabled | default(false) or qat_dp_enabled | default(false) or - jaeger_operator | default(false) or + jaeger_enabled | default(false) or telegraf_enabled | default(false) or - prometheus_enabled | default(false) or + prometheus_stack_enabled | default(false) or opentelemetry_enabled | default(false) or - elasticsearch_enabled | default(false) or + eck_enabled | default(false) or linkerd_service_mesh.enabled | default(false) - kubernetes | default(false) | bool @@ -1207,88 +1314,12 @@ when: default_hugepage_size == "2M" when: hugepages_enabled | default(false) | bool -# STORY: "vpp/ovsdpdk require hugepage enabled and configured" - - ansible.builtin.debug: - msg: - - vpp_enabled = {{ vpp_enabled }} (host_vars) - - ansible_distribution = {{ ansible_distribution }} (basic facts) - - ansible_distribution_version = {{ ansible_distribution_version }} (basic facts) - - example_net_attach_defs = {{ example_net_attach_defs }} (group_vars/all.yml) - - userspace_ovs_dpdk = {{ example_net_attach_defs['userspace_ovs_dpdk'] }} (group_vars/all.yml) - - userspace_vpp = {{ example_net_attach_defs['userspace_vpp'] }} (group_vars/all.yml) - - sriov_net_dp = {{ example_net_attach_defs['sriov_net_dp'] }} (group_vars/all.yml) - - userspace_cni_enabled = {{ userspace_cni_enabled }} (host_vars) - - sriov_cni_enabled = {{ sriov_cni_enabled }} (host_vars) - - sriov_network_operator_enabled = {{ sriov_network_operator_enabled }} (host_vars) - - bond_cni_enabled = {{ bond_cni_enabled }} (host_vars) - - ovs_dpdk_enabled = {{ ovs_dpdk_enabled }} (host_vars) - - userspace_cni_enabled = {{ userspace_cni_enabled }} (host_vars) - - hugepages_enabled = {{ hugepages_enabled }} (host_vars) - - default_hugepage_size = {{ default_hugepage_size }} (host_vars) - - number_of_hugepages_1G = {{ number_of_hugepages_1G }} (host_vars) - - number_of_hugepages_2M = {{ number_of_hugepages_2M }} (host_vars) - when: vpp_enabled is defined and vpp_enabled # host_vars - - # W/A Disabled until userspace CNI compilation is fixed - - name: check OS for VPP compilation - ansible.builtin.fail: - msg: "VPP is temporarily not supported." - when: vpp_enabled is defined and vpp_enabled - - - name: check OS for VPP compilation - ansible.builtin.assert: - that: - - (ansible_distribution == 'Ubuntu' and ansible_distribution_version >= '22.04') - msg: "Unsupported configuration. 
VPP can be only enabled on Ubuntu >= 22.04" - when: vpp_enabled is defined and vpp_enabled - - - name: check OVS DPDK Dependencies - ansible.builtin.assert: - that: >- - ({{ ovs_dpdk_enabled }} and not {{ vpp_enabled }} and {{ hugepages_enabled }} and - "{{ default_hugepage_size }}" == "1G" and {{ number_of_hugepages_1G }} >= 0) - or {{ vpp_enabled }} - msg: - - Incorrect configuration pertaining OVS DPDK. Conflicting or improper values detected - - When OVS DPDK is enabled, VPP must be disabled and default_hugepage_size must be set to 1G according to host_vars example. - - Also check these conditions in group_vars, example_net_attach_defs['userspace_ovs_dpdk']=true, example_net_attach_defs['userspace_vpp']=false. - - Please correct the configuration - when: ovs_dpdk_enabled is defined and ovs_dpdk_enabled - - - name: check VPP Dependencies - ansible.builtin.assert: - that: >- - ({{ vpp_enabled }} and not {{ ovs_dpdk_enabled }} and {{ hugepages_enabled }} and - "{{ default_hugepage_size }}" == "2M" and {{ number_of_hugepages_2M }} >= 0) - or {{ ovs_dpdk_enabled }} - msg: - - Incorrect configuration pertaining VPP. Conflicting or improper values detected - - When VPP is enabled, OVS DPDK must be disabled and default_hugepage_size must be set to 2M according to host_vars example. - - Also check these conditions in group_vars, example_net_attach_defs['userspace_ovs_dpdk']=false, example_net_attach_defs['userspace_vpp']=true. - - Please correct the configuration. - when: vpp_enabled is defined and vpp_enabled - - -# STORY: "cnis require net-attach-defs to be enabled" - - name: check CNI Config - ansible.builtin.assert: - that: >- - ({{ userspace_cni_enabled }} and {{ ovs_dpdk_enabled }} and {{ example_net_attach_defs['userspace_ovs_dpdk'] }} and not {{ vpp_enabled }} and - not {{ example_net_attach_defs['userspace_vpp'] }} and {{ hugepages_enabled }} and - "{{ default_hugepage_size }}" == "1G" and {{ number_of_hugepages_1G }} >= 0) - or ({{ userspace_cni_enabled }} and not {{ ovs_dpdk_enabled }} and not {{ example_net_attach_defs['userspace_ovs_dpdk'] }} and {{ vpp_enabled }} - and {{ example_net_attach_defs['userspace_vpp'] }} and {{ hugepages_enabled }} and - "{{ default_hugepage_size }}" == "2M" and {{ number_of_hugepages_2M }} >= 0) - msg: - - Incorrect configuration pertaining CNI. Conflicting or improper values detected. - - When Userspace CNI is enabled, either OVS DPDK or VPP must be enabled, default_hugepage_size must also set under these options. - - 1. If VPP is enabled, ovs_dpdk_enabled=false, default_hugepage_size = 2M in host_vars; and - - example_net_attach_defs['userspace_ovs_dpdk']=false, example_net_attach_defs['userspace_vpp']=true in group_vars. - - 2. If OVS DPDK is enabled, default_hugepage_size = 1G in host_vars; and - - vpp_enabled=false, example_net_attach_defs['userspace_ovs_dpdk']=true, example_net_attach_defs['userspace_vpp']=false in group_vars. - - Please correct the configuration. 
- when: userspace_cni_enabled is defined and userspace_cni_enabled - + - name: Check Userspace CNI preflight + ansible.builtin.import_role: + name: userspace_cni_install + tasks_from: preflight.yml + tags: userspace-cni + when: userspace_cni_enabled | default(false) # STORY: "If SST enabled, confirm minimum kernel or kernel_update specified" - name: check platform before SST-PP verification @@ -1300,8 +1331,8 @@ - name: check Intel(R) SST-PP (feature perf-profile) requirements ansible.builtin.assert: that: - - (ansible_distribution == "Ubuntu" and ansible_distribution_version >= "20.04") or - (ansible_os_family == "RedHat" and ansible_distribution_version >= "8.3") + - (ansible_distribution == "Ubuntu" and ansible_distribution_version is version('20.04', '>=')) or + (ansible_os_family == "RedHat" and ansible_distribution_version is version('8.3', '>=')) - turbo_boost_enabled fail_msg: - "Playbook terminated cause of failure might to two reasons: Firstly, SST-PP is only supported in RedHat / Rocky >= 8.3 or Ubuntu >=20.04" @@ -1413,23 +1444,11 @@ - configured_arch not in ['atom', 'core', 'ultra'] - profile_name not in ['on_prem_sw_defined_factory'] -# STORY: "collectd and telegraf are mutually exclusive" - - name: fail if collectd and telegraf are both enabled - ansible.builtin.assert: - that: >- - (({{ collectd_enabled | bool }}) and (not ({{ telegraf_enabled | bool }}))) - or (not {{ collectd_enabled | bool }} and {{ telegraf_enabled | bool }}) - or (not {{ collectd_enabled | bool }} and (not ({{ telegraf_enabled | bool }}))) - msg: "CollectD and Telegraf are mutually exclusive, please enable only one." - when: - - collectd_enabled is defined - - telegraf_enabled is defined - # STORY: "supported k8s versions require istio in >= 1.10" - name: fail if istio version is not compatible with current k8s version ansible.builtin.assert: that: - - "{{ istio_service_mesh.version is version('1.10', '>=') }}" + - istio_service_mesh.version is version('1.10', '>=') msg: | "Selected Istio service mesh version: '{{ istio_service_mesh.version }}' is not compatible with selected k8s version: '{{ kube_version }}'" "Please, refer to the compatibility table at https://istio.io/latest/docs/releases/supported-releases/" @@ -1443,122 +1462,86 @@ - name: check if sgx dp and cert manager are enabled when TCS enabled ansible.builtin.assert: that: - - "{{ sgx_dp_enabled | default(false) }}" - - "{{ cert_manager_enabled | default(false) }}" + - sgx_dp_enabled | default(false) + - cert_manager_enabled | default(false) msg: "sgx_dp and cert manager should be enabled in order to have TCS functional." when: - tcs.enabled | default(false) - - configured_arch in ['icx'] + - configured_arch in ['icx', 'spr'] # STORY: "TAC depends on TCS" - name: check if TCS is enabled when TAC enabled ansible.builtin.assert: that: - - "{{ tcs.enabled | default(false) }}" - - "{{ kmra.apphsm.enabled | default(false) }}" - - "{{ kmra.pccs.enabled | default(false) }}" + - tcs.enabled | default(false) + - kmra.apphsm.enabled | default(false) + - kmra.pccs.enabled | default(false) msg: "TCS, KMRA AppHSM and PCCS should be enabled in order to have TAC functional." 
when: - tac.enabled | default(false) - - configured_arch in ['icx'] + - configured_arch in ['icx', 'spr'] -# STORY: "istio_service_mesh.sgx_signer' option is available only for icx platforms" +# STORY: "istio_service_mesh.sgx_signer' option is available only for icx, spr platforms" - name: particular service mesh options are available only for specific platforms ansible.builtin.assert: that: - - "{{ not istio_service_mesh.sgx_signer.enabled | default(false) }}" + - not istio_service_mesh.sgx_signer.enabled | default(false) msg: "'istio_service_mesh.sgx_signer' option is not available for the configured platform architecture." when: - istio_service_mesh.enabled | default(false) - - configured_arch not in ['icx'] + - configured_arch not in ['icx', 'spr'] # STORY: "istio_service_mesh.sgx_signer' option must be true when profile is ca_custom" - name: particular service mesh options must be set together ansible.builtin.assert: that: - - "{{ istio_service_mesh.sgx_signer.enabled | default(false) }}" + - istio_service_mesh.sgx_signer.enabled | default(false) msg: "'istio_service_mesh.sgx_signer' must be enabled for custom-ca profile." when: - istio_service_mesh is defined - istio_service_mesh.enabled | default(false) - istio_service_mesh.profile == 'custom-ca' | default('default') - - configured_arch in ['icx'] + - configured_arch in ['icx', 'spr'] -# STORY: TCS is available only for icx platforms" +# STORY: TCS is available only for icx, spr platforms" - name: TCS is available only for specific platforms ansible.builtin.assert: that: - - "{{ not tcs.enabled | default(false) }}" + - not tcs.enabled | default(false) msg: "TCS is not available for the configured platform architecture." when: - - configured_arch not in ['icx'] + - configured_arch not in ['icx', 'spr'] - name: Make sure istio and linkerd are not enabled at the same time ansible.builtin.assert: that: - - "{{ not linkerd_service_mesh.enabled | default (false) }}" + - not linkerd_service_mesh.enabled | default (false) fail_msg: "You should not have enabled Istio and LinkerD service mesh on at the same time. Please choose and enable only one service mesh." when: - istio_service_mesh.enabled | default(false) -# STORY: TAC is available only for icx platforms" +# STORY: TAC is available only for icx, spr platforms" - name: TAC is available only for specific platforms ansible.builtin.assert: that: - - "{{ not tac.enabled | default(false) }}" + - not tac.enabled | default(false) msg: "TAC is not available for the configured platform architecture." when: - - configured_arch not in ['icx'] + - configured_arch not in ['icx', 'spr'] # STORY: "istio_service_mesh.sgx_signer' option depends on KMRA AppHSM, KMRA PCCS, TCS, TAC. - name: check if KMRA Apps, TCS and TAC are enabled when service mesh sgx_signer option is enabled ansible.builtin.assert: that: - - "{{ kmra.apphsm.enabled | default(false) }}" - - "{{ kmra.pccs.enabled | default(false) }}" - - "{{ tcs.enabled | default(false ) }}" - - "{{ tac.enabled | default(false ) }}" + - kmra.apphsm.enabled | default(false) + - kmra.pccs.enabled | default(false) + - tcs.enabled | default(false ) + - tac.enabled | default(false ) msg: "In order to use service mesh sgx-signer option, please, enable KMRA AppHSM, KMRA PCCS, TCS, TAC." 
when: - istio_service_mesh.sgx_signer.enabled | default(false) - - configured_arch in ['icx'] - -# STORY: TEMPORARY: "ovs dpdk version requirements" - - ansible.builtin.debug: - msg: - - install_dpdk = {{ install_dpdk }} (host_vars) - - dpdk_version = {{ dpdk_version }} (host_vars) - - ovs_dpdk_enabled = {{ ovs_dpdk_enabled }} (host_vars) - - ovs_version = {{ ovs_version }} (host_vars) - when: - - install_dpdk is defined # host_vars - - dpdk_version is defined # host_vars - - ovs_version is defined # host_vars - - ovs_dpdk_enabled is defined and ovs_dpdk_enabled # host_vars - - # Refer https://docs.openvswitch.org/en/latest/faq/releases/ to get OVS DPDK compatibility - - name: check OVS DPDK compatibility - ansible.builtin.assert: - that: - ovs_version == 'v3.2.0' and dpdk_version == '22.11.1' - or ovs_version == 'v3.1.1' and dpdk_version == '22.11.1' - or ovs_version == 'v3.0.1' and dpdk_version == '21.11.2' - or (ovs_version >= 'v2.17.0' and ovs_version <= 'v3.0.3') and (dpdk_version >= '21.11' and dpdk_version <= '22.07') - or (ovs_version < 'v2.16.2' and ovs_version >= 'v2.16.0') and dpdk_version == '21.08' - or ovs_version == 'v2.15.0' and dpdk_version == '20.11' - or ovs_version == 'v2.14.2' and dpdk_version == '19.11.6' - or ovs_version == 'v2.14.1' and dpdk_version == '19.11.6' - or ovs_version == 'v2.14.0' and dpdk_version == '19.11.6' - or ovs_version == 'v2.13.3' and dpdk_version == '19.11.6' - or ovs_version == 'v2.13.2' and dpdk_version == '19.11.6' - or ovs_version == 'v2.13.1' and dpdk_version == '19.11.6' - or ovs_version == 'v2.13.0' and dpdk_version == '19.11.6' - msg: "OVS {{ ovs_version }} does not build with DPDK version {{ dpdk_version }}. Please correct the host_vars configuration" - when: - - dpdk_version is defined # host_vars - - ovs_version is defined # host_vars - - ovs_dpdk_enabled is defined and ovs_dpdk_enabled # host_vars + - configured_arch in ['icx', 'spr'] - name: check Kubernetes Power Manager configuration ansible.builtin.include_role: @@ -1575,14 +1558,13 @@ - name: make sure isolcpus and cpusets are not enabled simultaneously ansible.builtin.assert: that: - - (not isolcpus_enabled and cpusets_enabled) or - (isolcpus_enabled and not cpusets_enabled) + - "[isolcpus_enabled, cpusets_enabled] is not all" msg: - "isolcpus_enabled and cpusets_enabled can't be enabled simultaneously." 
- "Please correct the host_vars configuration for target '{{ ansible_hostname }}'" when: - - (isolcpus_enabled is defined and isolcpus_enabled) or - (cpusets_enabled is defined and cpusets_enabled) + - isolcpus_enabled is defined + - cpusets_enabled is defined - name: check Intel SRIOV-FEC Operator requirements ansible.builtin.include_role: @@ -1613,10 +1595,10 @@ - name: check OS when DLB or DSA is enabled ansible.builtin.assert: that: - - (ansible_distribution == "Ubuntu" and ansible_distribution_version == '20.04' + - (ansible_distribution == "Ubuntu" and ansible_distribution_version is version('20.04', '==') and (update_kernel or ansible_kernel[0:4] is version('5.14', '>='))) or - (ansible_distribution == "Ubuntu" and ansible_distribution_version >= '22.04') or - (ansible_os_family == "RedHat" and ansible_distribution_version >= '8.6') + (ansible_distribution == "Ubuntu" and ansible_distribution_version is version('22.04', '>=')) or + (ansible_os_family == "RedHat" and ansible_distribution_version is version('8.6', '>=')) success_msg: "DLB or DSA can succesfully be enabled on {{ ansible_distribution }} {{ ansible_distribution_version }}" msg: | DLB and DSA features are not supported on Ubuntu 20.04(with stock kernel) and RHEL/Rocky 8.5. @@ -1633,7 +1615,7 @@ ansible.builtin.assert: that: - ansible_distribution == "Ubuntu" - - ansible_distribution_version == "22.04" + - ansible_distribution_version is version('22.04', '==') msg: "Deploying SGX on VMRA is supported only on Ubuntu 22.04 VM host. Please change the o/s for VM host" - name: Check if configured SGX memory is not bigger than total memory @@ -1654,7 +1636,7 @@ # Storage preflight check - name: check storage persistent volumes ansible.builtin.assert: - that: "persistent_volumes != []" + that: persistent_volumes != [] msg: "Persistent volumes on target '{{ ansible_hostname }}' must be set in host_vars. Please correct the configuration" when: - local_volume_provisioner_enabled | default(false) or @@ -1721,45 +1703,23 @@ tags: intel-oneapi when: intel_oneapi_enabled | default(false) | bool -# STORY: "Observability: assert that all required compontents are enabled" - - name: assert that all observability/monitoring variables are disabled - ansible.builtin.assert: - that: - - not elasticsearch_enabled | default(false) - - not jaeger_operator | default(false) - - not opentelemetry_enabled | default(false) - - not kibana_enabled | default(false) - fail_msg: | - When both telegraf and collectd are disabled then the rest of Observability stack needs to be disabled: elastisearch, jaeger, opentelemtry and kibana. - when: - - not telegraf_enabled | default(false) - - not collectd_enabled | default(false) + - name: Check Kibana configuration + ansible.builtin.include_role: + name: kibana_install + tasks_from: preflight + when: kibana_enabled | default(false) - - name: assert that all observability/monitoring variables are enabled for telegraf - ansible.builtin.assert: - that: - - telegraf_enabled | default(false) - - elasticsearch_enabled | default(false) - - jaeger_operator | default(false) - - opentelemetry_enabled | default(false) - - kibana_enabled | default(false) - fail_msg: | - Observability needs to be enabled as a stack: telegraf, elastisearch, jaeger, opentelemtry and kibana. 
- when: - - telegraf_enabled | default(false) + - name: Check Jaeger configuration + ansible.builtin.include_role: + name: jaeger_install + tasks_from: preflight + when: jaeger_enabled | default(false) - - name: assert that all observability/monitoring variables are disabled for collectd - ansible.builtin.assert: - that: - - not telegraf_enabled | default(false) - - not elasticsearch_enabled | default(false) - - not jaeger_operator | default(false) - - not opentelemetry_enabled | default(false) - - not kibana_enabled | default(false) - fail_msg: | - When collectd is enabled then Observability stack needs to be disabled: telegraf, elastisearch, jaeger, opentelemtry and kibana. - when: - - collectd_enabled | default(false) + - name: Check OpenTelemetry configuration + ansible.builtin.include_role: + name: opentelemetry_install + tasks_from: preflight + when: opentelemetry_enabled | default(false) - name: Check Telegraf configuration ansible.builtin.include_role: diff --git a/playbooks/versions.yml b/playbooks/versions.yml index 0f6d95e1..b6ce2832 100644 --- a/playbooks/versions.yml +++ b/playbooks/versions.yml @@ -21,31 +21,32 @@ versions_parsing_errors_file: "{{ playbook_dir }}/../versions_parsing_errors" tasks: - name: Check if required source files exist - stat: + ansible.builtin.stat: path: '../examples/k8s/full_nfv/host_vars/node1.yml' register: examples - name: Create required source files - make: + community.general.make: chdir: "{{ playbook_dir }}/.." when: not examples.stat.exists - name: Show versions_output_file name - debug: + ansible.builtin.debug: msg: "versions_output_file is: {{ versions_output_file }}" - name: Show versions_parsing_errors_file name - debug: + ansible.builtin.debug: msg: "versions_parsing_errors_file is: {{ versions_parsing_errors_file }}" - name: Show variable values block: - name: Extract versions - shell: "echo -n '{{ item.description }}', && scripts/yaml_version_reader {{ item.var_file_path }} {{ item.shortname }}" + ansible.builtin.shell: "echo -n '{{ item.description }}', && scripts/yaml_version_reader {{ item.var_file_path }} {{ item.shortname }}" changed_when: false args: chdir: ".." 
register: item_value - loop: # noqa yaml[colons] yaml[indentation] + # yamllint disable rule:colons rule:indentation + loop: - { 'description' : 'Telegraf', 'shortname' : 'telegraf_image_tag', 'var_file_path' : 'roles/telegraf_install/defaults/main.yml' @@ -96,25 +97,25 @@ 'shortname' : 'rancher_version', 'var_file_path' : 'roles/rke2_kubernetes_apps/rancher/defaults/main.yml' } - - { 'description' : 'k8s node-exporter', + - { 'description' : 'Node Exporter', 'var_file_path' : 'roles/prometheus_install/defaults/main.yml', 'shortname' : 'node_exporter_version' } - - { 'description' : 'k8s prometheus-operator', + - { 'description' : 'Prometheus Operator', 'var_file_path' : 'roles/prometheus_install/kube_prometheus/defaults/main.yml', 'shortname' : "prometheus_operator_version" } - - { 'description' : 'k8s prometheus-adapter', + - { 'description' : 'Prometheus Adapter', 'var_file_path' : 'roles/prometheus_install/kube_prometheus/files/kube-prometheus-stack/prometheusAdapter-clusterRole.yaml', 'shortname' : "metadata\\'\\]\\[\\'labels\\'\\]\\[\\'app.kubernetes.io/version" } - - { 'description' : 'k8s kube-rbac-proxy', + - { 'description' : 'Kubernetes RBAC proxy', 'var_file_path' : 'roles/cluster_defaults/defaults/main.yml', 'shortname' : 'kube_rbac_proxy_image_tag' } - { 'description' : 'Node Feature Discovery', 'var_file_path' : 'roles/nfd_install/defaults/main.yml', - 'shortname' : 'nfd_image_tag' + 'shortname' : 'nfd_version' } - { 'description' : 'Vector Packet Processing', 'var_file_path' : 'roles/userspace_cni_install/defaults/main.yml', @@ -122,31 +123,31 @@ } - { 'description' : 'CNI plugins', 'shortname' : 'cni_version', - 'var_file_path' : 'collections/ansible_collections/kubernetes_sigs/kubespray/roles/download/defaults/main.yml' + 'var_file_path' : 'collections/ansible_collections/kubernetes_sigs/kubespray/roles/download/defaults/main/main.yml' } - - { 'description' : 'calico', + - { 'description' : 'Calico', 'shortname' : 'calico_version', - 'var_file_path' : 'collections/ansible_collections/kubernetes_sigs/kubespray/roles/download/defaults/main.yml' + 'var_file_path' : 'collections/ansible_collections/kubernetes_sigs/kubespray/roles/download/defaults/main/main.yml' } - - { 'description' : 'calico vpp dataplane', + - { 'description' : 'Calico (VPP dataplane)', 'shortname' : 'k8s_calico_vpp_version', 'var_file_path' : 'roles/calico_vpp_install/defaults/main.yml' } - { 'description' : 'flannel', 'shortname' : 'flannel_version', - 'var_file_path' : 'collections/ansible_collections/kubernetes_sigs/kubespray/roles/download/defaults/main.yml' + 'var_file_path' : 'collections/ansible_collections/kubernetes_sigs/kubespray/roles/download/defaults/main/main.yml' } - { 'description' : 'coredns', 'shortname' : 'coredns_version', - 'var_file_path' : 'collections/ansible_collections/kubernetes_sigs/kubespray/roles/download/defaults/main.yml' + 'var_file_path' : 'collections/ansible_collections/kubernetes_sigs/kubespray/roles/download/defaults/main/main.yml' } - { 'description' : 'krew', 'shortname' : 'krew_version', - 'var_file_path' : 'collections/ansible_collections/kubernetes_sigs/kubespray/roles/download/defaults/main.yml' + 'var_file_path' : 'collections/ansible_collections/kubernetes_sigs/kubespray/roles/download/defaults/main/main.yml' } - { 'description' : 'helm', 'shortname' : 'helm_version', - 'var_file_path' : 'collections/ansible_collections/kubernetes_sigs/kubespray/roles/download/defaults/main.yml' + 'var_file_path' : 
'collections/ansible_collections/kubernetes_sigs/kubespray/roles/download/defaults/main/main.yml' } - { 'description' : 'helm on rke2', 'shortname' : 'helm_version', @@ -234,7 +235,7 @@ } - { 'description' : 'Intel® QATlib', 'shortname' : 'intel_qatlib_download_url_version', - 'var_file_path' : 'roles/bootstrap/configure_openssl/defaults/main.yml' + 'var_file_path' : 'roles/bootstrap/install_qatlibs/defaults/main.yml' } - { 'description' : 'OpenSSL QAT Engine', 'var_file_path' : 'roles/openssl_engine_install/defaults/main.yml', @@ -253,29 +254,29 @@ 'shortname' : 'sgx_sdk_version_rhel' } - { 'description' : 'Intel® KMRA', - 'var_file_path' : 'roles/kmra_install/defaults/main.yml', + 'var_file_path' : 'roles/kmra_install/defaults/main/main.yml', 'shortname' : "kmra_defaults\\'\\]\\[\\'image_tag" } - { 'description' : 'Intel® KMRA AppHSM', - 'var_file_path' : 'roles/kmra_install/defaults/main.yml', + 'var_file_path' : 'roles/kmra_install/defaults/main/main.yml', 'shortname' : "kmra_defaults\\'\\]\\[\\'apphsm\\'\\]\\[\\'image_tag", 'optional' : 'true', 'reason' : 'version is the same as for Intel® KMRA' } - { 'description' : 'Intel® KMRA PCCS', - 'var_file_path' : 'roles/kmra_install/defaults/main.yml', + 'var_file_path' : 'roles/kmra_install/defaults/main/main.yml', 'shortname' : "kmra_defaults\\'\\]\\[\\'pccs\\'\\]\\[\\'image_tag", 'optional' : 'true', 'reason' : 'version is the same as for Intel® KMRA' } - { 'description' : 'Intel® KMRA CTK', - 'var_file_path' : 'roles/kmra_install/defaults/main.yml', + 'var_file_path' : 'roles/kmra_install/defaults/main/main.yml', 'shortname' : "kmra_defaults\\'\\]\\[\\'ctk_loadkey_demo\\'\\]\\[\\'image_tag", 'optional' : 'true', 'reason' : 'version is the same as for Intel® KMRA' } - { 'description' : 'Intel® KMRA CTK nginx', - 'var_file_path' : 'roles/kmra_install/defaults/main.yml', + 'var_file_path' : 'roles/kmra_install/defaults/main/main.yml', 'shortname' : "kmra_defaults\\'\\]\\[\\'ctk_loadkey_demo\\'\\]\\[\\'nginx_image_tag", 'optional' : 'true', 'reason' : 'version is the same as for Intel® KMRA' @@ -332,9 +333,9 @@ 'var_file_path' : 'roles/minio_install/defaults/main.yaml', 'shortname' : "minio_git_tag" } - - { 'description' : 'Power Manager Operator', - 'var_file_path' : 'roles/intel_power_manager/defaults/main.yml', - 'shortname' : 'intel_power_manager_git_ref' + - { 'description' : 'Kubernetes Power Manager', + 'var_file_path' : 'roles/kubernetes_power_manager/defaults/main.yml', + 'shortname' : 'kubernetes_power_manager_git_ref' } - { 'description' : 'Intel SR-IOV FEC Operator', 'var_file_path' : 'roles/intel_sriov_fec_operator/defaults/main.yml', @@ -344,7 +345,7 @@ 'var_file_path' : 'roles/intel_sriov_fec_operator/defaults/main.yml', 'shortname' : 'intel_sriov_fec_operator_img_ver' } - - { 'description' : 'FEC Operator SDK', + - { 'description' : 'Operator SDK', 'var_file_path' : 'roles/operator_framework/defaults/main.yml', 'shortname' : 'operator_sdk_git_ref' } @@ -361,8 +362,8 @@ 'shortname' : 'dpdk_version' } - { 'description' : 'Open vSwitch with DPDK', - 'var_file_path' : 'examples/k8s/full_nfv/host_vars/node1.yml', - 'shortname' : 'ovs_version' + 'var_file_path' : 'roles/userspace_cni_install/defaults/main.yml', + 'shortname' : 'default_ovs_version' } - { 'description' : 'Intel® QAT Drivers', 'var_file_path' : 'roles/bootstrap/install_qat_drivers_services/defaults/main.yml', @@ -380,28 +381,24 @@ 'var_file_path' : 'roles/linkerd_service_mesh/defaults/main.yml', 'shortname' : 'linkerd_version' } - - { 'description' : 'cAdvisor helm 
chart', - 'var_file_path' : 'roles/cadvisor_install/defaults/main.yaml', - 'shortname' : 'cadvisor_helm_chart_version' - } - { 'description' : 'cAdvisor', 'var_file_path' : 'roles/cadvisor_install/defaults/main.yaml', - 'shortname' : 'cadvisor_image_version' + 'shortname' : 'cadvisor_version' } - - { 'description' : 'Intel® adq dp', + - { 'description' : 'Intel® ADQ Device Plugins', 'var_file_path' : 'roles/adq_dp_install/defaults/main.yml', 'shortname' : 'intel_adq_dp_version' } - - { 'description' : 'adq ice fw', + - { 'description' : 'ADQ ICE firmware', 'var_file_path' : 'roles/bootstrap/update_nic_firmware/defaults/main.yml', 'shortname' : 'adq_ice_fw_required_version' } - { 'description' : 'cilium', - 'var_file_path' : 'collections/ansible_collections/kubernetes_sigs/kubespray/roles/download/defaults/main.yml', + 'var_file_path' : 'collections/ansible_collections/kubernetes_sigs/kubespray/roles/download/defaults/main/main.yml', 'shortname' : 'cilium_version' } - { 'description' : 'cert manager', - 'var_file_path' : 'collections/ansible_collections/kubernetes_sigs/kubespray/roles/download/defaults/main.yml', + 'var_file_path' : 'collections/ansible_collections/kubernetes_sigs/kubespray/roles/download/defaults/main/main.yml', 'shortname' : 'cert_manager_version' } - { 'description' : 'cert manager on rke2', @@ -428,11 +425,11 @@ 'var_file_path' : 'roles/container_engine/crio/defaults/main.yml', 'shortname' : 'crio_version' } - - { 'description' : 'registry', + - { 'description' : 'container registry', 'shortname' : 'registry_version', 'var_file_path' : 'roles/container_registry/defaults/main.yml' } - - { 'description' : 'nginx', + - { 'description' : 'nginx web server for container registry', 'shortname' : 'registry_nginx_version', 'var_file_path' : 'roles/container_registry/defaults/main.yml' } @@ -448,16 +445,12 @@ 'shortname' : 'golang_version', 'var_file_path' : 'roles/bootstrap/golang_install/defaults/main.yml' } - - { 'description' : 'cluster_name', - 'var_file_path' : 'collections/ansible_collections/kubernetes_sigs/kubespray/roles/kubespray-defaults/defaults/main.yaml', - 'shortname' : 'cluster_name' - } - { 'description' : 'containerd', 'var_file_path' : 'roles/container_engine/containerd/defaults/main.yml', 'shortname' : 'containerd_version' } - { 'description' : 'multus', - 'var_file_path' : 'collections/ansible_collections/kubernetes_sigs/kubespray/roles/download/defaults/main.yml', + 'var_file_path' : 'collections/ansible_collections/kubernetes_sigs/kubespray/roles/download/defaults/main/main.yml', 'shortname' : 'multus_version' } - { 'description' : 'nfd', @@ -466,32 +459,24 @@ } - { 'description' : 'weave', 'shortname' : 'weave_version', - 'var_file_path' : 'collections/ansible_collections/kubernetes_sigs/kubespray/roles/download/defaults/main.yml' - } - - { 'description' : 'kube-vip', - 'shortname' : 'kube_vip_image_tag', - 'var_file_path' : 'collections/ansible_collections/kubernetes_sigs/kubespray/roles/download/defaults/main.yml' - } - - { 'description' : 'nginx-ingress', - 'shortname' : 'kubernetes_ingress_helm_chart_version', - 'var_file_path' : 'roles/kubernetes_ingress_install/defaults/main.yml' + 'var_file_path' : 'collections/ansible_collections/kubernetes_sigs/kubespray/roles/download/defaults/main/main.yml' } - - { 'description' : 'argocd', - 'shortname' : 'argocd_version', - 'var_file_path' : 'collections/ansible_collections/kubernetes_sigs/kubespray/roles/kubernetes-apps/argocd/defaults/main.yml' - } - - { 'description' : 'metallb', - 'shortname' : 
'metallb_version', - 'var_file_path' : 'collections/ansible_collections/kubernetes_sigs/kubespray/roles/download/defaults/main.yml' + - { 'description' : 'ingress_nginx_helm_chart', + 'shortname' : 'ingress_nginx_helm_version', + 'var_file_path' : 'roles/ingress_nginx_install/defaults/main.yml' } - { 'description' : 'kibana', 'shortname' : 'kibana_chart_version', 'var_file_path' : 'roles/kibana_install/defaults/main.yml' } - - { 'description' : 'Rook/Ceph', + - { 'description' : 'Rook/Ceph-Rook', 'var_file_path' : 'roles/rook_install/defaults/main.yml', 'shortname' : "rook_git_tag" } + - { 'description' : 'Rook/Ceph-Ceph', + 'var_file_path' : 'roles/rook_install/defaults/main.yml', + 'shortname' : "ceph_version" + } - { 'description' : 'FFmpeg', 'var_file_path' : 'roles/ffmpeg_install/defaults/main.yml', 'shortname' : "ffmpeg_commit_hash" @@ -520,8 +505,29 @@ 'var_file_path' : 'roles/imtl_install/defaults/main.yml', 'shortname' : 'imtl_version' } + - { 'description' : 'KubeVirt', + 'var_file_path' : 'roles/kubevirt_install/defaults/main.yml', + 'shortname' : 'kubevirt_version' + } + - { 'description' : 'Infrastructure Power Manager DPDK patches', + 'var_file_path' : 'roles/infrastructure_power_manager/defaults/main.yml', + 'shortname' : 'ipm_dpdk_patches_commit' + } + - { 'description' : 'ECK (Elasticsearch on Kubernetes) cluster version', + 'var_file_path' : 'roles/eck_install/defaults/main.yml', + 'shortname' : 'eck_version' + } + - { 'description' : 'Elasticsearch', + 'var_file_path' : 'roles/eck_install/defaults/main.yml', + 'shortname' : 'eck_elasticsearch_version' + } + - { 'description' : 'Intel In-Band Manageability Framework', + 'var_file_path' : 'roles/intel_inband_manageability/defaults/main.yml', + 'shortname' : 'intel_inbm_git_tag' + } + # yamllint enable rule:colons rule:indentation - name: Remove old version parsing results - file: + ansible.builtin.file: path: "{{ item }}" state: absent failed_when: false @@ -529,7 +535,7 @@ - "{{ versions_output_file }}" - "{{ versions_parsing_errors_file }}" - name: Write versions into output file - lineinfile: + ansible.builtin.lineinfile: path: "{{ versions_output_file }}" line: >- "{{ item.stdout }}{% if (item.stderr and item.item.optional | default(false)) or (item.stdout and item.item.optional | default(false) and @@ -538,7 +544,7 @@ create: yes loop: "{{ item_value.results }}" - name: Write version parsing errors into errors file - lineinfile: + ansible.builtin.lineinfile: path: '{{ versions_parsing_errors_file }}' line: "{{ item.stdout }}\nParameter {{ item.item.shortname }} in file {{ item.item.var_file_path }} was not found\n{{ item.stderr }}\n" mode: 0644 @@ -548,10 +554,12 @@ - item.stderr - not item.item.optional | default(false) - name: Add ddp_profile variable - shell: set -o pipefail | grep ddp_profile ../examples/k8s/full_nfv/host_vars/node1.yml |head -n1| awk '{print $3}' + ansible.builtin.shell: + cmd: set -o pipefail && grep ddp_profile ../examples/k8s/full_nfv/host_vars/node1.yml |head -n1| awk '{print $3}' + executable: /bin/bash changed_when: false register: ddp_profile - name: Add ddp_profile variable - lineinfile: + ansible.builtin.lineinfile: path: "{{ versions_output_file }}" line: "ddp_profile,{{ ddp_profile.stdout }}" diff --git a/playbooks/vm.yml b/playbooks/vm.yml index 7cdd9c22..4028679f 100644 --- a/playbooks/vm.yml +++ b/playbooks/vm.yml @@ -16,6 +16,8 @@ --- # VM enabled # If VM is enabled then Virtual Machines are created and CEK is deployed into those VMs +- name: SRIOV NIC and SRIOV QAT auto-detection 
+ import_playbook: autodetect.yml - name: preflight checks import_playbook: preflight.yml - name: configure target hosts OS layer diff --git a/requirements.txt b/requirements.txt index 113e3dcb..58c365fb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,11 @@ -ansible==7.7.0 -ansible-core==2.14.9 +ansible==8.6.1 +ansible-core==2.15.8 cryptography==41.0.6 -jinja2==3.1.2 -netaddr==0.8.0 -pbr==5.11.1 +jinja2==3.1.3 +netaddr==0.9.0 +paramiko==3.4.0 +pbr==6.0.0 jmespath==1.0.1 -ruamel.yaml==0.17.32 -ruamel.yaml.clib==0.2.7 +ruamel.yaml==0.18.5 +ruamel.yaml.clib==0.2.8 MarkupSafe==2.1.3 -ipaddr diff --git a/roles/adq_dp_install/tasks/main.yml b/roles/adq_dp_install/tasks/main.yml index 6ec59832..e987c9f8 100644 --- a/roles/adq_dp_install/tasks/main.yml +++ b/roles/adq_dp_install/tasks/main.yml @@ -25,17 +25,31 @@ "type": "adq-cni", "tunneling": "vxlan", "tunneling-interface": "vxlan.calico", - "kubeletServerName": "{{ kube_service_addresses | ipaddr('next_usable') }}", + "kubeletServerName": "{{ kube_service_addresses | ansible.utils.ipaddr('next_usable') }}", "kubeletPort": "443", "kubeletCAPath": "/etc/kubernetes/pki/ca.crt" } - block: - - name: restart Calico pods - ansible.builtin.shell: "kubectl delete pod $(kubectl get pods -n kube-system | grep calico | awk '{ print $1 }') -n kube-system" - args: - executable: /bin/bash - changed_when: true + - name: Find calico pods + kubernetes.core.k8s_info: + kind: Pod + namespace: kube-system + label_selectors: + - k8s-app in (calico-node, calico-kube-controllers) + register: calico_pods_stat + no_log: true + + - name: Restart calico pods + kubernetes.core.k8s: + kind: Pod + namespace: kube-system + state: absent + name: "{{ item.metadata.name }}" + loop: "{{ calico_pods_stat.resources }}" + when: + - calico_pods_stat.resources is defined + - calico_pods_stat.resources | length > 0 - name: create Intel ADQ Device Plugin directory ansible.builtin.file: diff --git a/roles/bond_cni_install/defaults/main.yml b/roles/bond_cni_install/defaults/main.yml index eda20b84..6b15a3bc 100644 --- a/roles/bond_cni_install/defaults/main.yml +++ b/roles/bond_cni_install/defaults/main.yml @@ -16,4 +16,4 @@ --- bond_cni_git_url: "https://github.com/intel/bond-cni.git" bond_cni_dir: "{{ (project_root_dir, 'bond-cni') | path_join }}" -bond_cni_version: "1578bc19a1cb8ea137abab77f682fc3d5ebfdc54" +bond_cni_version: "408b549d88c6395b50ef20bf3195c7e1a8766a47" diff --git a/roles/bootstrap/allocate_cpus/tasks/main.yml b/roles/bootstrap/allocate_cpus/tasks/main.yml index 951ac059..a4cb57f9 100644 --- a/roles/bootstrap/allocate_cpus/tasks/main.yml +++ b/roles/bootstrap/allocate_cpus/tasks/main.yml @@ -14,6 +14,16 @@ ## limitations under the License. 
## --- +- name: Remove old cpu allocations + ansible.builtin.file: + path: "{{ item }}" + state: absent + with_fileglob: + - "~/.cpupin/{{ hostvars[inventory_hostname]['ansible_hostname'] }}_*" + delegate_to: localhost + become: false + when: vm_recreate_existing | default(true) + - name: Allocate requested number of CPUs cpupin: name: "{{ item.name }}" @@ -23,6 +33,7 @@ number_host_os: "{{ cpu_host_os if cpu_host_os is defined else omit }}" alloc_all: "{{ item.alloc_all if item.alloc_all is defined else omit }}" pinning: false + host_name: "{{ hostvars[inventory_hostname]['ansible_hostname'] }}" loop: "{{ vms }}" changed_when: true register: allocated_cpus diff --git a/roles/bootstrap/apply_kubernetes_reqs/tasks/main.yml b/roles/bootstrap/apply_kubernetes_reqs/tasks/main.yml index 976fc3a8..05d3d7dc 100644 --- a/roles/bootstrap/apply_kubernetes_reqs/tasks/main.yml +++ b/roles/bootstrap/apply_kubernetes_reqs/tasks/main.yml @@ -34,8 +34,9 @@ changed_when: false become: yes +# service_facts module does not gather .swap unit files - name: get swap service unit name - shell: set -o pipefail | systemctl list-unit-files | grep \.swap | awk '{print $1}' # noqa command-instead-of-module + ansible.builtin.shell: set -o pipefail | systemctl list-unit-files | grep \.swap | awk '{print $1}' # noqa command-instead-of-module args: executable: /bin/bash register: unit_name @@ -43,14 +44,14 @@ become: yes - name: stop swap service - systemd: + ansible.builtin.systemd: state: stopped name: "{{ unit_name.stdout }}" become: yes when: '"swap" in unit_name.stdout' - name: mask swap service to /dev/null - systemd: + ansible.builtin.systemd: name: "{{ unit_name.stdout }}" masked: yes become: yes diff --git a/roles/bootstrap/auto_detect_nic_devices/tasks/main.yml b/roles/bootstrap/auto_detect_nic_devices/tasks/main.yml new file mode 100644 index 00000000..5bf9f686 --- /dev/null +++ b/roles/bootstrap/auto_detect_nic_devices/tasks/main.yml @@ -0,0 +1,99 @@ +## +## Copyright (c) 2020-2023 Intel Corporation. +## +## Licensed under the Apache License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. +## You may obtain a copy of the License at +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +## See the License for the specific language governing permissions and +## limitations under the License. 
+## +--- +- name: user configured dataplane_interfaces + ansible.builtin.debug: var=dataplane_interfaces + +- name: initializing new dataplane_interfaces list + ansible.builtin.set_fact: + new_dataplane_interfaces: [] + +- name: get supported NIC PF devices + ansible.builtin.shell: + cmd: "set -o pipefail && lspci -nnm | egrep -i '{{ nic_supported_pf_dev_ids | join('|') }}'" + executable: /bin/bash + register: nic_pci_pf_devices + changed_when: false + failed_when: nic_pci_pf_devices.rc not in [0, 1] + when: not on_vms | default(false) + +- name: get supported NIC VF devices + ansible.builtin.shell: + cmd: "set -o pipefail && lspci -nnm | egrep -i '{{ nic_supported_vf_dev_ids | join('|') }}'" + executable: /bin/bash + register: nic_pci_vf_devices + changed_when: false + failed_when: nic_pci_vf_devices.rc not in [0, 1] + when: on_vms | default(false) + +- name: get NIC device bus IDs + ansible.builtin.set_fact: + nic_pci_devices: "{{ (nic_pci_vf_devices.stdout_lines if (on_vms | default(false)) else nic_pci_pf_devices.stdout_lines) | + map('split') | map('first') }}" + +- name: get NIC device uevent information + ansible.builtin.slurp: + src: "{{ ('/sys/bus/pci/devices', ('0000:' + item), 'uevent') | path_join }}" + with_items: "{{ nic_pci_devices }}" + register: nic_dev_uevent_reg + when: not on_vms | default(false) + +- name: read sriov_totalvfs for each NIC PF + ansible.builtin.slurp: + src: "{{ ('/sys/bus/pci/devices', ('0000:' + item), 'sriov_totalvfs') | path_join }}" + with_items: "{{ nic_pci_devices }}" + register: nic_dev_sriov_total_vfs_reg + when: not on_vms | default(false) + +- name: create new_dataplane_interfaces list + ansible.builtin.set_fact: + new_dataplane_interfaces: |- + [ + {% for uevent, total_vfs in nic_dev_uevent_reg.results | zip(nic_dev_sriov_total_vfs_reg.results) %} + { + "pf_driver": "{{ uevent.content | b64decode | regex_search('DRIVER=(.+)', '\1') | first }}", + "bus_info": "{{ (uevent.content | b64decode | regex_search('PCI_SLOT_NAME=(.+)', '\1') | first)[5:] }}", + "ddp_profile": "{{ default_ddp_profile }}", + "default_vf_driver": "{{ dataplane_interface_default_vf_driver }}", + "sriov_numvfs": {{ total_vfs.content | b64decode | trim }}, + "sriov_vfs": {}, + }, + {% endfor %} + ] + when: not on_vms | default(false) + +- name: create new_dataplane_interfaces list on VMs + ansible.builtin.set_fact: + new_dataplane_interfaces: |- + [ + {% for dev in nic_pci_devices %} + { + "pf_driver": "iavf", + "bus_info": "{{ dev }}", + "default_vf_driver": "iavf", + "sriov_numvfs": 0, + }, + {% endfor %} + ] + when: on_vms | default(false) + +- name: replace original dataplane_interfaces + ansible.builtin.set_fact: + dataplane_interfaces: "{{ new_dataplane_interfaces }}" + +- name: print new dataplane_interfaces + ansible.builtin.debug: + var: dataplane_interfaces diff --git a/roles/bootstrap/auto_detect_nic_devices/tasks/preflight.yml b/roles/bootstrap/auto_detect_nic_devices/tasks/preflight.yml new file mode 100644 index 00000000..acb798bb --- /dev/null +++ b/roles/bootstrap/auto_detect_nic_devices/tasks/preflight.yml @@ -0,0 +1,49 @@ +## +## Copyright (c) 2020-2023 Intel Corporation. +## +## Licensed under the Apache License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. 
+## You may obtain a copy of the License at +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +## See the License for the specific language governing permissions and +## limitations under the License. +## +--- +- name: get count of configured NIC devices + ansible.builtin.set_fact: + user_configured_nic_devices_count: "{{ vms | map(attribute='nic_devices_count', default=0) | sum }}" + +- name: get supported NIC PF devices + ansible.builtin.shell: + cmd: "set -o pipefail && lspci -nnm | egrep -i '{{ nic_supported_pf_dev_ids | join('|') }}'" + executable: /bin/bash + register: nic_pci_pf_devices + changed_when: false + failed_when: nic_pci_pf_devices.rc not in [0, 1] + when: not on_vms | default(false) + +- name: get NIC PF device bus IDs + ansible.builtin.set_fact: + nic_pci_devices: "{{ ['0000:'] | product(nic_pci_pf_devices.stdout_lines | map('split') | map('first'))| map('join') | list }}" + +- name: read sriov_totalvfs for each NIC PF + ansible.builtin.slurp: + src: "{{ ('/sys/bus/pci/devices', item, 'sriov_totalvfs') | path_join }}" + with_items: "{{ nic_pci_devices }}" + register: nic_dev_sriov_total_vfs_reg + +- name: get sum of NIC devices SRIOV VFs available + ansible.builtin.set_fact: + nic_device_sriov_vfs_sum: "{{ (nic_device_sriov_vfs_sum | d(0) | int) + (item.content | b64decode | trim | int) }}" + with_items: "{{ nic_dev_sriov_total_vfs_reg.results }}" + +- name: check if we have enough NIC VFs available + ansible.builtin.assert: + that: (user_configured_nic_devices_count | int) <= (nic_device_sriov_vfs_sum | int) + fail_msg: + "You have configured more NIC devices for VMs ({{ user_configured_nic_devices_count }}) than is available on system ({{ nic_device_sriov_vfs_sum }})." 
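
The new preflight above boils down to a capacity check: sum `sriov_totalvfs` over the detected PFs and make sure the VMs do not request more VF-backed NIC devices than that. A standalone sketch of the same logic; the sysfs layout is real, while the PCI addresses and requested count are placeholders:

```python
# Sketch of the NIC VF capacity check performed by preflight.yml above.
from pathlib import Path

def total_sriov_vfs(pf_addresses):
    # Reads /sys/bus/pci/devices/<addr>/sriov_totalvfs, as the slurp tasks do.
    return sum(
        int((Path("/sys/bus/pci/devices") / addr / "sriov_totalvfs").read_text())
        for addr in pf_addresses
    )

requested = 8                                    # sum of nic_devices_count over vms
pf_addresses = ["0000:18:00.0", "0000:18:00.1"]  # placeholders for detected PFs
available = total_sriov_vfs(pf_addresses)
if requested > available:
    raise SystemExit(
        f"You have configured more NIC devices for VMs ({requested}) "
        f"than is available on system ({available})."
    )
```

Running this during preflight (rather than at VM creation time) surfaces over-subscription before any hosts are reconfigured, which is the point of both new preflight task files.
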
diff --git a/roles/bootstrap/auto_detect_qat_devices/tasks/main.yml b/roles/bootstrap/auto_detect_qat_devices/tasks/main.yml index 6c1989e1..9925cca7 100644 --- a/roles/bootstrap/auto_detect_qat_devices/tasks/main.yml +++ b/roles/bootstrap/auto_detect_qat_devices/tasks/main.yml @@ -15,69 +15,90 @@ ## --- - name: user configured qat_devices - debug: var=qat_devices + ansible.builtin.debug: var=qat_devices - name: initializing new qat_devices list - set_fact: + ansible.builtin.set_fact: new_qat_devices: [] -- name: detect QAT PF ids from system - shell: "set -o pipefail && lspci -nn | egrep -i '{{ qat_supported_pf_dev_ids | join('|') }}' | cut -d' ' -f1" - args: +- name: get supported QAT PF devices + ansible.builtin.shell: + cmd: "set -o pipefail && lspci -nnm | egrep -i '{{ qat_supported_pf_dev_ids | join('|') }}'" executable: /bin/bash - register: qat_pf_id_values + register: qat_pci_pf_devices changed_when: false - failed_when: qat_pf_id_values.rc != 0 and qat_pf_id_values.rc != 1 + failed_when: qat_pci_pf_devices.rc not in [0, 1] when: not on_vms | default(false) -# QAT PF can be possibly added to VM as well -- name: detect QAT VF ids from system - shell: "set -o pipefail && lspci -nn | egrep -i '{{ (qat_supported_vf_dev_ids + qat_supported_pf_dev_ids) | join('|') }}' | cut -d' ' -f1" - args: +- name: get supported QAT VF devices + ansible.builtin.shell: + cmd: "set -o pipefail && lspci -nnm | egrep -i '{{ qat_supported_vf_dev_ids | join('|') }}'" executable: /bin/bash - register: qat_vf_id_values + register: qat_pci_vf_devices changed_when: false - failed_when: qat_vf_id_values.rc != 0 and qat_vf_id_values.rc != 1 + failed_when: qat_pci_vf_devices.rc not in [0, 1] when: on_vms | default(false) -- name: change qat_sriov_numvfs_required to zero for VMs - set_fact: - qat_sriov_numvfs_required: 0 - when: on_vms | default(false) +- name: get QAT device bus IDs + ansible.builtin.set_fact: + qat_pci_devices: "{{ ['0000:'] | + product((qat_pci_vf_devices.stdout_lines if (on_vms | default(false)) else qat_pci_pf_devices.stdout_lines) | + map('split') | map('first'))| map('join') | list }}" -- name: set common qat_id_values - set_fact: - qat_id_values: "{% if on_vms | default(false) %}{{ qat_vf_id_values.stdout_lines }}{% else %}{{ qat_pf_id_values.stdout_lines }}{% endif %}" +- name: read sriov_totalvfs for each QAT PF + ansible.builtin.slurp: + src: "{{ ('/sys/bus/pci/devices', item, 'sriov_totalvfs') | path_join }}" + with_items: "{{ qat_pci_devices }}" + register: qat_dev_sriov_total_vfs_reg + when: not on_vms | default(false) -- name: print QAT PF id in system - debug: - var: qat_id_values +- name: get QAT device max SRIOV VFs + ansible.builtin.set_fact: + qat_device_sriov_total_vfs: "{{ qat_device_sriov_total_vfs | d({}) | combine({ item.item: (item.content | b64decode | trim) }) }}" + with_items: "{{ qat_dev_sriov_total_vfs_reg.results }}" + when: not on_vms | default(false) -- name: print configured values - debug: - msg: - - "qat_sriov_numvfs_required = {{ qat_sriov_numvfs_required }}" - - "qat_vf_driver_required = {{ qat_vf_driver_required }}" +- name: get QAT device uevent information + ansible.builtin.slurp: + src: "{{ ('/sys/bus/pci/devices', item, 'uevent') | path_join }}" + with_items: "{{ qat_pci_devices }}" + register: qat_dev_uevent_reg + when: not on_vms | default(false) -- name: Create new_qat_devices list - set_fact: - new_qat_devices: - "{{ new_qat_devices | default([]) + [{ 'qat_id' : '0000:' + item, 'qat_sriov_numvfs' : qat_sriov_numvfs_required, - 'qat_default_vf_driver' 
: qat_vf_driver_required }] }}" - with_items: "{{ qat_id_values }}" +- name: create new_qat_devices list + ansible.builtin.set_fact: + new_qat_devices: |- + [ + {% for uevent, total_vfs in qat_dev_uevent_reg.results | zip(qat_dev_sriov_total_vfs_reg.results) %} + { + "qat_id": "{{ uevent.item }}", + "qat_sriov_numvfs": {{ total_vfs.content | b64decode | trim }}, + "qat_default_vf_driver": "{{ uevent.content | b64decode | regex_search('DRIVER=(.+)', '\1') | default([qat_vf_driver_required[:-2]]) | first }}vf", + "qat_vfs": {} + }, + {% endfor %} + ] changed_when: true when: not on_vms | default(false) -- name: Create new_qat_devices list on VMs - set_fact: - new_qat_devices: "{{ new_qat_devices | default([]) + [{ 'qat_id' : '0000:' + item, 'qat_sriov_numvfs' : qat_sriov_numvfs_required }] }}" - with_items: "{{ qat_id_values }}" +- name: create new_qat_devices list for VMs + ansible.builtin.set_fact: + new_qat_devices: |- + [ + {% for item in qat_pci_devices %} + { + "qat_id": "{{ item }}", + "qat_sriov_numvfs": 0 + }, + {% endfor %} + ] changed_when: true when: on_vms | default(false) - name: replace original qat_devices - set_fact: + ansible.builtin.set_fact: qat_devices: "{{ new_qat_devices }}" -- name: auto detected qat_devices - debug: var=qat_devices +- name: print new qat_devices + ansible.builtin.debug: + var: qat_devices diff --git a/roles/bootstrap/auto_detect_qat_devices/tasks/preflight.yml b/roles/bootstrap/auto_detect_qat_devices/tasks/preflight.yml new file mode 100644 index 00000000..2cc5ab3b --- /dev/null +++ b/roles/bootstrap/auto_detect_qat_devices/tasks/preflight.yml @@ -0,0 +1,48 @@ +## +## Copyright (c) 2020-2023 Intel Corporation. +## +## Licensed under the Apache License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. +## You may obtain a copy of the License at +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +## See the License for the specific language governing permissions and +## limitations under the License. 
+## +--- +- name: get count of configured QAT devices + ansible.builtin.set_fact: + user_configured_qat_devices_count: "{{ vms | map(attribute='qat_devices_count', default=0) | sum }}" + +- name: get supported QAT PF devices + ansible.builtin.shell: + cmd: "set -o pipefail && lspci -nnm | egrep -i '{{ qat_supported_pf_dev_ids | join('|') }}'" + executable: /bin/bash + register: qat_pci_pf_devices + changed_when: false + failed_when: qat_pci_pf_devices.rc not in [0, 1] + +- name: get QAT PF device bus IDs + ansible.builtin.set_fact: + qat_pci_devices: "{{ ['0000:'] | product(qat_pci_pf_devices.stdout_lines | map('split') | map('first'))| map('join') | list }}" + +- name: read sriov_totalvfs for each QAT PF + ansible.builtin.slurp: + src: "{{ ('/sys/bus/pci/devices', item, 'sriov_totalvfs') | path_join }}" + with_items: "{{ qat_pci_devices }}" + register: qat_dev_sriov_total_vfs_reg + +- name: get sum of QAT devices SRIOV VFs available + ansible.builtin.set_fact: + qat_device_sriov_vfs_sum: "{{ (qat_device_sriov_vfs_sum | d(0) | int) + (item.content | b64decode | trim | int) }}" + with_items: "{{ qat_dev_sriov_total_vfs_reg.results }}" + +- name: check if we have enough QAT VFs available + ansible.builtin.assert: + that: (user_configured_qat_devices_count | int) <= (qat_device_sriov_vfs_sum | int) + fail_msg: + "You have configured more QAT devices for VMs ({{ user_configured_qat_devices_count }}) than are available on the system ({{ qat_device_sriov_vfs_sum }})."
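The assert above closes the preflight: it compares the VF demand declared in host_vars against what the QAT PFs on the target can actually provide. A minimal sketch of the vms input it sums over (VM names and counts are illustrative, not defaults from this repository):

vms:
  - name: "vm-ctrl-1"
    qat_devices_count: 4   # QAT VFs requested for this VM
  - name: "vm-work-1"
    qat_devices_count: 2   # total demand of 6 must stay within the summed sriov_totalvfs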
diff --git a/roles/bootstrap/configure_disks/tasks/configure_loopdevices.yml b/roles/bootstrap/configure_disks/tasks/configure_loopdevices.yml index ed063d9b..9bd25842 100644 --- a/roles/bootstrap/configure_disks/tasks/configure_loopdevices.yml +++ b/roles/bootstrap/configure_disks/tasks/configure_loopdevices.yml @@ -58,6 +58,9 @@ force: yes state: present with_sequence: start=1 end={{ req_num }} stride=1 + when: + - minio_enabled | default(false) | bool or + local_volume_provisioner_enabled | default(false) | bool - name: create mounted folder ansible.builtin.file: @@ -65,6 +68,9 @@ state: directory mode: '0755' with_sequence: start=1 end={{ req_num }} stride=1 + when: + - minio_enabled | default(false) | bool or + local_volume_provisioner_enabled | default(false) | bool - name: populate loopdevice_bind script ansible.builtin.template: diff --git a/roles/bootstrap/configure_disks/templates/loopdevice_bind.j2 b/roles/bootstrap/configure_disks/templates/loopdevice_bind.j2 index c3b8dcd8..1918a4d0 100644 --- a/roles/bootstrap/configure_disks/templates/loopdevice_bind.j2 +++ b/roles/bootstrap/configure_disks/templates/loopdevice_bind.j2 @@ -3,15 +3,28 @@ max_devices={{ simulated_disk_num }} bind_loopdevices() { +{% if local_volume_provisioner_enabled | default(false) or minio_enabled | default(false) %} for i in `seq 1 $max_devices` do mount -t ext4 -o loop /opt/cek/disks/tmp/diskimage$i /mnt/disks/ra_disk$i echo "mounted /opt/cek/disks/tmp/diskimage$i to /mnt/disks/ra_disk$i" done +{% elif rook_ceph.enabled | default(false) %} + for i in `seq 1 $max_devices` + do + losetup -f /opt/cek/disks/tmp/diskimage$i + echo "created loop device for /opt/cek/disks/tmp/diskimage$i" + done +{% endif %} } detach_loopdevices() { +{% if local_volume_provisioner_enabled | default(false) or minio_enabled | default(false) %} umount /mnt/disks/ra_disk? +{% elif rook_ceph.enabled | default(false) %} + losetup -D + echo "detached loopdevices" +{% endif %} } option=${1:-""} diff --git a/roles/bootstrap/configure_dlb/defaults/main.yml b/roles/bootstrap/configure_dlb/defaults/main.yml index ac9212c2..4cf4fd20 100644 --- a/roles/bootstrap/configure_dlb/defaults/main.yml +++ b/roles/bootstrap/configure_dlb/defaults/main.yml @@ -14,6 +14,6 @@ ## limitations under the License. ## --- -intel_dlb_driver_ver: "dlb_linux_src_release_8.5.1" -intel_dlb_driver_url: "https://downloadmirror.intel.com/787629/{{ intel_dlb_driver_ver }}.txz" -intel_dlb_driver_checksum: "sha1:53BA0D21CDB5EBAAAB247FC9C73FAED7F0899B95" +intel_dlb_driver_ver: "dlb_linux_src_release_8.7.0" +intel_dlb_driver_url: "https://downloadmirror.intel.com/795608/{{ intel_dlb_driver_ver }}.txz" +intel_dlb_driver_checksum: "sha1:DCFE0918F7EBC3B62408DA3143642935127441DA" diff --git a/roles/bootstrap/configure_dlb/tasks/main.yml b/roles/bootstrap/configure_dlb/tasks/main.yml index 10a7e465..09b9398d 100644 --- a/roles/bootstrap/configure_dlb/tasks/main.yml +++ b/roles/bootstrap/configure_dlb/tasks/main.yml @@ -15,107 +15,115 @@ ## --- - name: install dependencies for Intel DLB driver - include_role: + ansible.builtin.include_role: name: install_dependencies # dependencies are not limited to packages - name: insert mdev module - modprobe: + community.general.modprobe: name: mdev state: present - name: load mdev module on boot - lineinfile: + ansible.builtin.lineinfile: path: /etc/modules-load.d/mdev.conf line: mdev - create: yes - mode: 0644 - become: yes + create: true + mode: '0644' + become: true # build and install Intel DLB driver - name: download DLB driver - become: yes - get_url: + become: true + ansible.builtin.get_url: url: "{{ intel_dlb_driver_url }}" dest: "{{ project_root_dir }}" checksum: "{{ intel_dlb_driver_checksum }}" timeout: 60 - mode: 0644 + mode: '0644' register: dlb_download until: dlb_download is not failed retries: 5 - name: untar DLB driver on Ubuntu - unarchive: + ansible.builtin.unarchive: src: "{{ dlb_download.dest }}" dest: "{{ project_root_dir }}" - list_files: yes - remote_src: yes - mode: 0774 - become: yes + list_files: true + remote_src: true + mode: '0774' + become: true when: ansible_os_family == "Debian" # Ansible built-in unarchive not working as expected in RHEL / Rocky using shell as alternative - name: extract DLB driver package on RHEL / Rocky - shell: "tar --xz -xf {{ intel_dlb_driver_ver }}.txz" # noqa command-instead-of-module + ansible.builtin.shell: "tar --xz -xf {{ intel_dlb_driver_ver }}.txz" # noqa command-instead-of-module args: chdir: "{{ project_root_dir }}" executable: /bin/bash + changed_when: true when: ansible_os_family == "RedHat" # Build workaround on Rocky 9.1/9.2 # Driver's Makefile condition on disabling SIOV on RHEL does not work on Rocky. As a workaround disabling SIOV manually.
-- name: "Disable SIOV on Rocky 9.1 (workaround)" +- name: "Disable SIOV on Rocky 9.1/9.2 (workaround)" ansible.builtin.replace: dest: "{{ project_root_dir }}/dlb/driver/dlb2/Makefile" regexp: '^\s*ccflags-y \+= -DCONFIG_INTEL_DLB2_SIOV' replace: '#ccflags-y += -DCONFIG_INTEL_DLB2_SIOV' when: - ansible_distribution == 'Rocky' - - ansible_distribution_version == '9.1' or ansible_distribution_version == '9.2' + - ansible_distribution_version in ['9.1', '9.2'] - name: build Intel DLB driver - make: + community.general.make: chdir: "{{ project_root_dir }}/dlb/driver/dlb2" - name: check if DLB module is loaded - command: lsmod + ansible.builtin.command: lsmod register: dlb_module failed_when: false changed_when: false - name: insert DLB module - command: insmod dlb2.ko + ansible.builtin.command: insmod dlb2.ko args: chdir: "{{ project_root_dir }}/dlb/driver/dlb2" + changed_when: true when: "'dlb' not in dlb_module.stdout" +- name: create dlb2 module directory in kernel drivers + ansible.builtin.file: + path: "/usr/lib/modules/{{ ansible_kernel }}/kernel/drivers/dlb2" + state: directory + mode: '0755' + - name: link dlb2 module to kernel drivers - file: + ansible.builtin.file: state: link - src: "{{ project_root_dir }}/dlb/driver/dlb2" - dest: "/usr/lib/modules/{{ ansible_kernel }}/kernel/drivers/dlb2" - force: yes - mode: 0644 + src: "{{ project_root_dir }}/dlb/driver/dlb2/dlb2.ko" + dest: "/usr/lib/modules/{{ ansible_kernel }}/kernel/drivers/dlb2/dlb2.ko" + force: true + mode: '0644' - name: setup DLB module loading on boot - lineinfile: + ansible.builtin.lineinfile: path: /etc/modules-load.d/dlb2.conf line: dlb2 - create: yes - mode: 0644 - become: yes + create: true + mode: '0644' + become: true - name: check if DLB devices are present on the system - find: + ansible.builtin.find: path: /dev file_type: any - use_regex: yes + use_regex: true pattern: "^(dlb)[0-9]*$" # devices have to start with "dlb" followed by the ID at the end register: dlb_devices - name: assert DLB devices presence - assert: + ansible.builtin.assert: that: - dlb_devices.matched > 0 fail_msg: diff --git a/roles/bootstrap/configure_dsa/defaults/main.yml b/roles/bootstrap/configure_dsa/defaults/main.yml index 13a58d80..641fd521 100644 --- a/roles/bootstrap/configure_dsa/defaults/main.yml +++ b/roles/bootstrap/configure_dsa/defaults/main.yml @@ -16,6 +16,6 @@ --- idxd_config_git_url: "https://github.com/intel/idxd-config.git" idxd_config_dir: "/usr/src/idxd-config" -idxd_config_git_ref: "accel-config-v4.0" +idxd_config_git_ref: "accel-config-v4.1.4" dsa_devices_dir: "/sys/bus/dsa/devices/" diff --git a/roles/bootstrap/configure_dsa/tasks/dsa_custom_config.yml b/roles/bootstrap/configure_dsa/tasks/dsa_custom_config.yml index 9fe389d2..3db116d5 100644 --- a/roles/bootstrap/configure_dsa/tasks/dsa_custom_config.yml +++ b/roles/bootstrap/configure_dsa/tasks/dsa_custom_config.yml @@ -114,12 +114,10 @@ - name: configure Work Queues include_tasks: wqs_custom_config.yml vars: - WQ: "{{ work_queue }}" # noqa var-naming + work_queue: "{{ item }}" dsa_id: "{{ dsa_dev_id }}" max_single_wq_size: "{{ max_wq_size | int }}" loop: "{{ dsa_device.wqs }}" - loop_control: - loop_var: work_queue - name: enable device {{ dsa_device.name }} command: accel-config enable-device {{ dsa_device.name }} diff --git a/roles/bootstrap/configure_dsa/tasks/main.yml b/roles/bootstrap/configure_dsa/tasks/main.yml index 976f9a1b..1a0f8070 100644 --- a/roles/bootstrap/configure_dsa/tasks/main.yml +++ b/roles/bootstrap/configure_dsa/tasks/main.yml @@ -15,14 
+15,14 @@ ## --- - name: install dependencies for Intel DSA devices - include_role: + ansible.builtin.include_role: name: install_dependencies - name: install accel-config tool - include_tasks: install_accel_config.yml + ansible.builtin.include_tasks: install_accel_config.yml - name: get number of DSA devices - find: + ansible.builtin.find: paths: "{{ dsa_devices_dir }}" file_type: any use_regex: yes @@ -31,7 +31,7 @@ register: found_dsa_devices - name: apply default configuration for DSA devices - include_tasks: dsa_default_config.yml + ansible.builtin.include_tasks: dsa_default_config.yml vars: dsa_id: "{{ item.path | basename | replace('dsa', '') }}" with_items: "{{ found_dsa_devices.files }}" @@ -40,13 +40,13 @@ - dsa_devices | default([]) | length | int == 0 - name: fail if configured number of DSA devices is greater than actual number of DSA devices on the node - fail: + ansible.builtin.fail: msg: "Max supported DSA devices by node is {{ found_dsa_devices.matched }}, but configuration for {{ dsa_devices | length }} was provided. Please update dsa_devices list in host_vars." when: dsa_devices | default([]) | length > found_dsa_devices.matched - name: apply custom configuration for DSA devices - include_tasks: dsa_custom_config.yml + ansible.builtin.include_tasks: dsa_custom_config.yml vars: dsa_device: "{{ item }}" loop: "{{ dsa_devices }}" @@ -56,12 +56,13 @@ # config will be saved to /etc/accel-config/accel-config.conf as default. - name: save accel-config configuration - command: accel-config save-config + ansible.builtin.command: accel-config save-config changed_when: true # WA for configuring DSA devices # in some CPU SKUs with specific BIOS version, wq_cap.wq_ats_support is disabled, so wq_ats_disable cannot be written. - name: modify accel-config.conf + when: not is_gnr block: - name: remove ats_disable parameter ansible.builtin.lineinfile: @@ -75,7 +76,7 @@ replace: "\"threshold\":0" - name: create systemd unit file - copy: + ansible.builtin.copy: src: "{{ (role_path , 'files', 'dsa_config.service') | path_join }}" dest: /lib/systemd/system/dsa_config.service owner: root @@ -83,7 +84,7 @@ mode: '0644' - name: ensure that systemd service is enabled - systemd: + ansible.builtin.systemd: name: dsa_config enabled: yes daemon_reload: yes diff --git a/roles/bootstrap/configure_dsa/tasks/wqs_custom_config.yml b/roles/bootstrap/configure_dsa/tasks/wqs_custom_config.yml index 5bf3b55f..2342abb4 100644 --- a/roles/bootstrap/configure_dsa/tasks/wqs_custom_config.yml +++ b/roles/bootstrap/configure_dsa/tasks/wqs_custom_config.yml @@ -16,37 +16,37 @@ --- - name: check WQ size value fail: - msg: "The max size of single WQ is {{ max_single_wq_size }}, but for WQ{{ dsa_id }}.{{ WQ.id }} value {{ WQ.size }} was provided." - when: WQ.size | int > max_single_wq_size | int + msg: "The max size of single WQ is {{ max_single_wq_size }}, but for WQ{{ dsa_id }}.{{ work_queue.id }} value {{ work_queue.size }} was provided." + when: work_queue.size | int > max_single_wq_size | int - name: check WQ threshold value fail: msg: "Wrong threshold value. Possible reasons are: threshold is defined and WQ mode is not shared, threshold is >= WQ size. 
- Please check these settings for WQ{{ dsa_id }}.{{ WQ.id }}" + Please check these settings for WQ{{ dsa_id }}.{{ work_queue.id }}" when: - - WQ.threshold is defined and WQ.mode == 'dedicated' or WQ.threshold is defined and WQ.threshold >= WQ.size + - work_queue.threshold is defined and work_queue.mode == 'dedicated' or work_queue.threshold is defined and work_queue.threshold >= work_queue.size - name: check WQ priority value fail: - msg: "Valid range for priority is from 1 to 15, but got {{ WQ.prio }} for WQ{{ dsa_id }}.{{ WQ.id }}. Please update the config list." - when: WQ.prio < 1 or WQ.prio > 15 + msg: "Valid range for priority is from 1 to 15, but got {{ work_queue.prio }} for WQ{{ dsa_id }}.{{ work_queue.id }}. Please update the config list." + when: work_queue.prio < 1 or work_queue.prio > 15 - name: check WQ type value fail: - msg: "Valid types are: kernel, user, but '{{ WQ.type }}' provided for WQ{{ dsa_id }}.{{ WQ.id }}. Please update the config list." - when: WQ.type not in ['kernel', 'user'] + msg: "Valid types are: kernel, user, but '{{ work_queue.type }}' provided for WQ{{ dsa_id }}.{{ work_queue.id }}. Please update the config list." + when: work_queue.type not in ['kernel', 'user'] - name: check WQ group id value fail: msg: "Valid group IDs are from 0 to {{ max_groups.stdout | int - 1 }}, but WQ.group_id provided. - Please update config for WQ{{ dsa_id }}.{{ WQ.id }}." - when: WQ.group_id < 0 or WQ.group_id > (max_groups.stdout | int - 1) + Please update config for WQ{{ dsa_id }}.{{ work_queue.id }}." + when: work_queue.group_id < 0 or work_queue.group_id > (max_groups.stdout | int - 1) - name: check WQ block_on_fault value fail: - msg: "block_on_fault should be either 0 or 1, but {{ WQ.block_on_fault }} was provided. - Please update config for WQ{{ dsa_id }}.{{ WQ.id }}." - when: WQ.block_on_fault not in [0, 1] + msg: "block_on_fault should be either 0 or 1, but {{ work_queue.block_on_fault }} was provided. + Please update config for WQ{{ dsa_id }}.{{ work_queue.id }}." + when: work_queue.block_on_fault not in [0, 1] # NOTE(pklimowx): consider unification of wq configuration tasks. For now accel-config tool # will fail when try to write 'shared' into mode param explicitly (all WQs are shared by default). 
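Read together, the validations above define the contract for every entry under dsa_devices[].wqs before any accel-config call runs. A minimal host_vars sketch that would pass all of the checks (values are illustrative, not defaults from this repository):

dsa_devices:
  - name: dsa0
    wqs:
      - id: 0
        mode: dedicated            # a 'shared' queue would additionally need threshold < size
        size: 16                   # must not exceed max_single_wq_size
        prio: 10                   # valid range is 1 to 15
        type: user                 # 'kernel' or 'user'
        group_id: 0                # 0 .. max_groups - 1
        block_on_fault: 0          # 0 or 1
        max_batch_size: 1024
        max_transfer_size: 2097152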
@@ -54,28 +54,30 @@ # (threshold can be set to -1 for Dedicated WQ using python style if-else) - name: configure Dedicated Work Queues command: >- - accel-config config-wq {{ dsa_device.name }}/wq{{ dsa_id }}.{{ WQ.id }} - --group-id={{ WQ.group_id }} - --mode={{ WQ.mode }} - --priority={{ WQ.prio }} - --wq-size={{ WQ.size }} - --max-batch-size={{ WQ.max_batch_size }} - --max-transfer-size={{ WQ.max_transfer_size }} - --block-on-fault={{ WQ.block_on_fault }} - --type={{ WQ.type }} - --name={{ WQ.mode }}-queue-{{ dsa_id }}.{{ WQ.id }} - when: WQ.mode == 'dedicated' + accel-config config-wq {{ dsa_device.name }}/wq{{ dsa_id }}.{{ work_queue.id }} + --group-id={{ work_queue.group_id }} + --mode={{ work_queue.mode }} + --priority={{ work_queue.prio }} + --wq-size={{ work_queue.size }} + --max-batch-size={{ work_queue.max_batch_size }} + --max-transfer-size={{ work_queue.max_transfer_size }} + --block-on-fault={{ work_queue.block_on_fault }} + --type={{ work_queue.type }} + --name={{ work_queue.mode }}-queue-{{ dsa_id }}.{{ work_queue.id }} + changed_when: true + when: work_queue.mode == 'dedicated' - name: configure Shared Work Queues command: >- - accel-config config-wq {{ dsa_device.name }}/wq{{ dsa_id }}.{{ WQ.id }} - --group-id={{ WQ.group_id }} - --threshold={{ WQ.threshold }} - --priority={{ WQ.prio }} - --wq-size={{ WQ.size }} - --max-batch-size={{ WQ.max_batch_size }} - --max-transfer-size={{ WQ.max_transfer_size }} - --block-on-fault={{ WQ.block_on_fault }} - --type={{ WQ.type }} - --name={{ WQ.mode }}-queue-{{ dsa_id }}.{{ WQ.id }} - when: WQ.mode == 'shared' + accel-config config-wq {{ dsa_device.name }}/wq{{ dsa_id }}.{{ work_queue.id }} + --group-id={{ work_queue.group_id }} + --threshold={{ work_queue.threshold }} + --priority={{ work_queue.prio }} + --wq-size={{ work_queue.size }} + --max-batch-size={{ work_queue.max_batch_size }} + --max-transfer-size={{ work_queue.max_transfer_size }} + --block-on-fault={{ work_queue.block_on_fault }} + --type={{ work_queue.type }} + --name={{ work_queue.mode }}-queue-{{ dsa_id }}.{{ work_queue.id }} + changed_when: true + when: work_queue.mode == 'shared' diff --git a/roles/bootstrap/configure_kpm_drivers/tasks/configure_drivers.yml b/roles/bootstrap/configure_kpm_drivers/tasks/configure_drivers.yml index 73b10f0e..6d61be39 100644 --- a/roles/bootstrap/configure_kpm_drivers/tasks/configure_drivers.yml +++ b/roles/bootstrap/configure_kpm_drivers/tasks/configure_drivers.yml @@ -22,14 +22,14 @@ state: present persistent: present -- name: acpi_cpufreq scaling driver +- name: intel_cpufreq scaling driver when: - - hostvars[inventory_hostname]['frequency_scaling_driver'] == "acpi_cpufreq" + - hostvars[inventory_hostname]['frequency_scaling_driver'] == "intel_cpufreq" - inventory_hostname in kubernetes_power_manager.power_nodes block: - name: set kernel flags to disable intel_pstate scaling driver ansible.builtin.set_fact: - intel_pstate_cmdline: 'GRUB_CMDLINE_LINUX="${GRUB_CMDLINE_LINUX} intel_pstate=disable intel_pstate=no_hwp"' + intel_pstate_cmdline: 'GRUB_CMDLINE_LINUX="${GRUB_CMDLINE_LINUX} intel_pstate=passive"' - name: disable intel_pstate in /etc/default/grub ansible.builtin.lineinfile: @@ -55,5 +55,5 @@ notify: - reboot server when: - - hostvars[inventory_hostname]['frequency_scaling_driver'] == "acpi_cpufreq" or + - hostvars[inventory_hostname]['frequency_scaling_driver'] == "intel_cpufreq" or hostvars[inventory_hostname]['uncore_frequency']['enabled'] | default(false) | bool
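Booting with intel_pstate=passive keeps the vendor driver loaded but hands frequency control to the generic cpufreq governors, which the kernel then reports as intel_cpufreq. A small post-reboot sanity check along these lines could confirm the switch took effect (the task is an illustrative sketch, not part of the role):

- name: verify active scaling driver is intel_cpufreq   # illustrative post-reboot check
  ansible.builtin.command: cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_driver
  register: scaling_driver
  changed_when: false
  failed_when: scaling_driver.stdout | trim != 'intel_cpufreq'

diff --git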
a/roles/bootstrap/configure_openssl/defaults/main.yml b/roles/bootstrap/configure_openssl/defaults/main.yml index 83441d99..f5ca55a0 100644 --- a/roles/bootstrap/configure_openssl/defaults/main.yml +++ b/roles/bootstrap/configure_openssl/defaults/main.yml @@ -15,6 +15,6 @@ ## --- openssl_url: "https://github.com/openssl/openssl.git" -openssl_version: "openssl-3.1.2" +openssl_version: "openssl-3.1.4" openssl_dir: "{{ (project_root_dir, 'openssl') | path_join }}" openssl_pkg_subdir: "{{ openssl_dir }}/{{ openssl_version }}" diff --git a/roles/bootstrap/configure_qat/tasks/bind_qat_vfs.yml b/roles/bootstrap/configure_qat/tasks/bind_qat_vfs.yml index 5543c20f..57950f6a 100644 --- a/roles/bootstrap/configure_qat/tasks/bind_qat_vfs.yml +++ b/roles/bootstrap/configure_qat/tasks/bind_qat_vfs.yml @@ -42,17 +42,35 @@ become: yes # get a list of VFs PCI addresses and save the configuration -- name: attach VFs driver +- name: Attach VFs driver block: - - name: fetch VFs pci addresses for a PF - shell: 'for vf in /sys/bus/pci/devices/{{ item.qat_id }}/virtfn*;do basename $(readlink -f $vf);done | sort' - register: vf_pciids - args: - executable: /bin/bash - changed_when: false + - name: Find VFs pci addresses for PF + ansible.builtin.find: + path: "/sys/bus/pci/devices/{{ item.qat_id }}/" + file_type: "link" + pattern: "virtfn*" + use_regex: true + recurse: false + register: vf_pciids_found - - name: save VF driver binding - lineinfile: + - name: Fetch QAT VFs device IDs + ansible.builtin.stat: + path: "{{ device_path }}" + register: vf_pciids_stats + loop: "{{ vf_pciids_found.files | map(attribute='path') | list | sort }}" + loop_control: + loop_var: device_path + when: + - vf_pciids_found.matched > 0 + + - name: Construct list of vf_pciids + ansible.builtin.set_fact: + vf_pciids: "{{ vf_pciids_stats.results | map(attribute='stat') | map(attribute='lnk_target') | map('basename') | list | sort }}" + when: + - vf_pciids_found.matched > 0 + + - name: Save VF driver binding + ansible.builtin.lineinfile: path: "{{ sriov_config_path }}/cek_qat_vfs_{{ item.qat_id }}" line: "{{ this_item[0] }} {{ this_item[1].value }}" regexp: "^{{ this_item[0] }}" @@ -61,9 +79,8 @@ group: root mode: '0600' become: yes - loop: "{{ vf_pciids.stdout_lines | zip(vfs_acc | dict2items) | list }}" + loop: "{{ vf_pciids | zip(vfs_acc | dict2items) | list }}" loop_control: loop_var: this_item when: - - vf_pciids.stderr|length == 0 - - vf_pciids.stdout_lines|length > 0 + - vf_pciids_found.matched > 0 diff --git a/roles/bootstrap/configure_qat/tasks/create_qat_vfs.yml b/roles/bootstrap/configure_qat/tasks/create_qat_vfs.yml index d28e17eb..205c90a9 100644 --- a/roles/bootstrap/configure_qat/tasks/create_qat_vfs.yml +++ b/roles/bootstrap/configure_qat/tasks/create_qat_vfs.yml @@ -34,11 +34,13 @@ # in case when QAT SR-IOV VFs have been already configured we reset it first to avoid "device or resource busy" error - name: reset QAT SR-IOV Virtual Functions shell: echo 0 > /sys/bus/pci/devices/{{ item.qat_id }}/sriov_numvfs + changed_when: true when: existing_vfs.stdout|int != 0 and existing_vfs.stdout|int != item.qat_sriov_numvfs and (existing_vfs.stdout|int != total_vfs.stdout|int or item.qat_sriov_numvfs|int == 0) - name: enable QAT SR-IOV Virtual Functions shell: echo {{ item.qat_sriov_numvfs }} > /sys/bus/pci/devices/{{ item.qat_id }}/sriov_numvfs + changed_when: true when: existing_vfs.stdout|int != item.qat_sriov_numvfs and (existing_vfs.stdout|int != total_vfs.stdout|int or item.qat_sriov_numvfs|int == 0) diff --git 
a/roles/bootstrap/configure_security/tasks/fw_debian.yaml b/roles/bootstrap/configure_security/tasks/fw_debian.yml similarity index 94% rename from roles/bootstrap/configure_security/tasks/fw_debian.yaml rename to roles/bootstrap/configure_security/tasks/fw_debian.yml index f36fdbcc..f5007eac 100644 --- a/roles/bootstrap/configure_security/tasks/fw_debian.yaml +++ b/roles/bootstrap/configure_security/tasks/fw_debian.yml @@ -18,6 +18,7 @@ command: ufw allow {{ item }} with_items: "{{ fw_open_ports['controller'] }}" become: yes + changed_when: true when: inventory_hostname in groups['kube_control_plane'] or ( 'vm_host' in groups and inventory_hostname in groups['vm_host']) @@ -25,6 +26,7 @@ command: ufw allow {{ item }} with_items: "{{ adq_open_ports['controller'] }}" become: yes + changed_when: true when: - inventory_hostname in groups['kube_control_plane'] - adq_dp.enabled |d(false) | bool @@ -33,12 +35,14 @@ command: ufw allow {{ item }} with_items: "{{ fw_open_ports['node'] }}" become: yes + changed_when: true when: inventory_hostname in groups['kube_node'] - name: open required ports in the firewall configuration on the worker nodes command: ufw allow {{ item }} with_items: "{{ adq_open_ports['node'] }}" become: yes + changed_when: true when: - inventory_hostname in groups['kube_node'] - adq_dp.enabled |d(false) | bool @@ -59,7 +63,8 @@ mode: 0644 - name: get the default interface's name - shell: set pipefail -o && route | grep default | awk '{print $8}' # interface name is at the very end of line + shell: + cmd: set -o pipefail && route | grep default | awk '{print $8}' # interface name is at the very end of line args: executable: /bin/bash changed_when: false diff --git a/roles/bootstrap/configure_security/tasks/fw_redhat.yaml b/roles/bootstrap/configure_security/tasks/fw_redhat.yml similarity index 95% rename from roles/bootstrap/configure_security/tasks/fw_redhat.yaml rename to roles/bootstrap/configure_security/tasks/fw_redhat.yml index 8316ff7a..1fd1be0e 100644 --- a/roles/bootstrap/configure_security/tasks/fw_redhat.yaml +++ b/roles/bootstrap/configure_security/tasks/fw_redhat.yml @@ -18,6 +18,7 @@ command: firewall-cmd --zone=public --add-port={{ item | regex_replace(':', '-') }} --permanent with_items: "{{ fw_open_ports['controller'] }}" become: yes + changed_when: true when: inventory_hostname in groups['kube_control_plane'] or ( 'vm_host' in groups and inventory_hostname in groups['vm_host']) @@ -25,6 +26,7 @@ command: firewall-cmd --zone=public --add-port={{ item | regex_replace(':', '-') }} --permanent with_items: "{{ adq_open_ports['controller'] }}" become: yes + changed_when: true when: - inventory_hostname in groups['kube_control_plane'] - adq_dp.enabled |d(false) | bool @@ -33,12 +35,14 @@ command: firewall-cmd --zone=public --add-port={{ item | regex_replace(':', '-') }} --permanent with_items: "{{ fw_open_ports['node'] }}" become: yes + changed_when: true when: inventory_hostname in groups['kube_node'] - name: open required ports in the firewall configuration on the node command: firewall-cmd --zone=public --add-port={{ item | regex_replace(':', '-') }} --permanent with_items: "{{ adq_open_ports['node'] }}" become: yes + changed_when: true when: - inventory_hostname in groups['kube_node'] - adq_dp.enabled |d(false) | bool @@ -56,7 +60,7 @@ become: yes - name: get the default interface's name - shell: set pipefail -o && route | grep default | awk '{print $8}' # interface name is at the very end of line + shell: set -o pipefail && route | grep default | awk '{print $8}' 
# interface name is at the very end of line args: executable: /bin/bash changed_when: false diff --git a/roles/bootstrap/configure_security/tasks/main.yml b/roles/bootstrap/configure_security/tasks/main.yml index 5ca89b23..dcf46bb1 100644 --- a/roles/bootstrap/configure_security/tasks/main.yml +++ b/roles/bootstrap/configure_security/tasks/main.yml @@ -32,11 +32,11 @@ become: yes - name: configure firewall on RedHat distributions - include_tasks: fw_redhat.yaml + include_tasks: fw_redhat.yml when: ansible_os_family == "RedHat" - name: configure firewall on Debian distributions - include_tasks: fw_debian.yaml + include_tasks: fw_debian.yml when: ansible_os_family == "Debian" when: firewall_enabled | default(false) | bool @@ -58,11 +58,12 @@ when: not firewall_enabled | default(false) | bool - name: configure SELinux - include_tasks: selinux.yaml + include_tasks: selinux.yml # fix for file size limit bug on RHEL-based distros, please see https://access.redhat.com/solutions/33993 - name: fix file size limit settings bug shell: sed -i -r 's/^(session\s+required\s+pam_limits.so)/#\1/' /etc/pam.d/* # noqa command-instead-of-module + changed_when: true when: ansible_os_family == "RedHat" - name: increase file size limit settings diff --git a/roles/bootstrap/configure_security/tasks/selinux.yaml b/roles/bootstrap/configure_security/tasks/selinux.yml similarity index 97% rename from roles/bootstrap/configure_security/tasks/selinux.yaml rename to roles/bootstrap/configure_security/tasks/selinux.yml index 22fa50a9..146b27cb 100644 --- a/roles/bootstrap/configure_security/tasks/selinux.yaml +++ b/roles/bootstrap/configure_security/tasks/selinux.yml @@ -31,4 +31,4 @@ - ansible_os_family == "RedHat" - ansible_selinux.status != "disabled" - (selinux_state is defined and selinux_state == "disabled") or - (container_runtime == "docker" and ansible_distribution_version >= '8.4') + (container_runtime == "docker" and ansible_distribution_version is version('8.4', '>=')) diff --git a/roles/bootstrap/configure_sgx/defaults/main.yml b/roles/bootstrap/configure_sgx/defaults/main.yml index 065be77a..a1acbaa9 100644 --- a/roles/bootstrap/configure_sgx/defaults/main.yml +++ b/roles/bootstrap/configure_sgx/defaults/main.yml @@ -15,9 +15,9 @@ ## --- # Intel SGX SDK for Ubuntu -sgx_sdk_version_ubuntu: "sgx_linux_x64_sdk_2.21.100.1.bin" -sgx_sdk_url_ubuntu: "https://download.01.org/intel-sgx/sgx-dcap/1.18/linux/distro/ubuntu22.04-server/{{ sgx_sdk_version_ubuntu }}" -sgx_sdk_checksum_ubuntu: "sha256:53e75ad08baad4f74c9f78e33bff30d3d3518160bf8729a4b213e8514c0fd0ec" +sgx_sdk_version_ubuntu: "sgx_linux_x64_sdk_2.22.100.3.bin" +sgx_sdk_url_ubuntu: "https://download.01.org/intel-sgx/sgx-dcap/1.19/linux/distro/ubuntu22.04-server/{{ sgx_sdk_version_ubuntu }}" +sgx_sdk_checksum_ubuntu: "sha256:941bd4e1c2b7c982688f4e6c6438715b18bf1ae4f2bf3c6c8d420ed792ab79c6" # Intel SGX-SGX Key configuration for Ubuntu >= 18.04.4 sgx_apt_source_list: "intel-sgx" @@ -25,24 +25,24 @@ sgx_apt_repo_url: "https://download.01.org/intel-sgx/sgx_repo/ubuntu" sgx_apt_repo_key: "{{ sgx_apt_repo_url }}/intel-sgx-deb.key" # Intel SGX SDK for RHEL -sgx_sdk_version_rhel: "sgx_linux_x64_sdk_2.21.100.1.bin" -sgx_sdk_url_rhel: "https://download.01.org/intel-sgx/sgx-dcap/1.18/linux/distro/rhel8.6-server/{{ sgx_sdk_version_rhel }}" -sgx_sdk_checksum_rhel: "sha256:2ae85c535118a5ff5c2b39cf5652eabae460b8e8d8094340919737eb31bc4021" +sgx_sdk_version_rhel: "sgx_linux_x64_sdk_2.22.100.3.bin" +sgx_sdk_url_rhel: 
"https://download.01.org/intel-sgx/sgx-dcap/1.19/linux/distro/rhel9.2-server/{{ sgx_sdk_version_rhel }}" +sgx_sdk_checksum_rhel: "sha256:59221c825dd0208bc2244f7721f6979cef18b4393b148b7fff30c56ca99f8c0e" # Intel SGX RPM local repository for RHEL sgx_rpm_local_repo_version_rhel: "sgx_rpm_local_repo.tgz" -sgx_rpm_local_repo_url_rhel: "https://download.01.org/intel-sgx/sgx-dcap/1.18/linux/distro/rhel8.6-server/{{ sgx_rpm_local_repo_version_rhel }}" -sgx_rpm_local_repo_checksum_rhel: "sha256:57f24d1f25d1ae100a4fcbfbdb7d49ef0a744f2edb69e9bb6fcf6e5667160444" +sgx_rpm_local_repo_url_rhel: "https://download.01.org/intel-sgx/sgx-dcap/1.19/linux/distro/rhel9.2-server/{{ sgx_rpm_local_repo_version_rhel }}" +sgx_rpm_local_repo_checksum_rhel: "sha256:4a87e00ba2b27a806b17d9b8976aa63069079c76682729a57adf036d2ac000b0" sgx_config_dir: "{{ project_root_dir }}" sgx_rpm_directory: "{{ (project_root_dir, 'sgx_rpm_local_repo') | path_join }}" -sgx_pkg_version: "2.21.100.1" -sgx_pkg_dcap_version: "1.18.100.1" +sgx_pkg_version: "2.22.100.3" +sgx_pkg_dcap_version: "1.19.100.3" -protobuf_version: protobuf-3.5.0-13.el8.x86_64.rpm -protobuf_repository: https://dl.rockylinux.org/vault/rocky/8.6/AppStream/x86_64/os/Packages/p -protobuf_library_version: libprotobuf.so.15 +protobuf_version: protobuf-3.14.0-13.el9.x86_64.rpm +protobuf_repository: https://dl.rockylinux.org/pub/rocky/9.3/AppStream/x86_64/os/Packages/p/ +protobuf_library_version: libprotobuf.so.25 protobuf_library_version_long: "{{ protobuf_library_version }}.0.0" protobuf_library_dir: usr/lib64 protobuf_dir: protobuf diff --git a/roles/bootstrap/configure_sgx/tasks/main.yml b/roles/bootstrap/configure_sgx/tasks/main.yml index eec54deb..52cdc0bd 100644 --- a/roles/bootstrap/configure_sgx/tasks/main.yml +++ b/roles/bootstrap/configure_sgx/tasks/main.yml @@ -45,6 +45,47 @@ when: - ansible_os_family == "RedHat" +- name: prepare worker node with sgx enabled + block: + - name: ensure sgx_prv group exists + ansible.builtin.group: + name: sgx_prv + state: present + + - name: add user to sgx_prv group + ansible.builtin.user: + name: "{{ ansible_user_id }}" + groups: sgx_prv + append: yes + + - name: create udev rules + ansible.builtin.blockinfile: + path: /etc/udev/rules.d/93-sgx-provision.rules + create: yes + mode: '0644' + block: | + SUBSYSTEM=="misc",KERNEL=="enclave",MODE="0666" + SUBSYSTEM=="misc",KERNEL=="provision",GROUP="sgx_prv",MODE="0660" + SUBSYSTEM=="sgx",KERNEL=="sgx/enclave",MODE="0666" + SUBSYSTEM=="sgx",KERNEL=="sgx/provision",MODE="0660" + SUBSYSTEM=="misc",KERNEL=="sgx_enclave",MODE="0666",SYMLINK+="sgx/enclave" + SUBSYSTEM=="misc",KERNEL=="sgx_provision",GROUP="sgx_prv",MODE="0660",SYMLINK+="sgx/provision" + + - name: copy configure-sgx-udev.service file + ansible.builtin.copy: + src: configure-sgx-udev.service + dest: /lib/systemd/system/configure-sgx-udev.service + mode: 0755 + + - name: ensure configure-sgx-udev.service started + ansible.builtin.systemd: + state: started + name: configure-sgx-udev + enabled: true + when: + - (ansible_distribution == "Ubuntu" and ansible_distribution_version is version('21.04', '>=')) + or (ansible_os_family == "RedHat" and ansible_distribution_version is version('8.4', '>=')) + - name: SGX configuration is successful debug: msg: diff --git a/roles/bootstrap/configure_sgx/tasks/rhel.yml b/roles/bootstrap/configure_sgx/tasks/rhel.yml index 7a716e5f..8ab85bcc 100644 --- a/roles/bootstrap/configure_sgx/tasks/rhel.yml +++ b/roles/bootstrap/configure_sgx/tasks/rhel.yml @@ -75,7 +75,9 @@ - name: Setting packages for 
Rocky / RHEL >= 9.0 for sgx platform block: - name: install software specific to Rocky / RHEL >= 9.0 for sgx platform - shell: "set -o pipefail && rpm --reinstall --nodeps '{{ sgx_rpm_directory }}/{{ item }}-{{ sgx_pkg_version }}-1.el8.x86_64.rpm'" + ansible.builtin.shell: + cmd: "set -o pipefail && rpm --reinstall --nodeps '{{ sgx_rpm_directory }}/{{ item }}-{{ sgx_pkg_version }}-1.el9.x86_64.rpm'" + executable: /bin/bash loop: - libsgx-launch - libsgx-epid @@ -91,7 +93,9 @@ failed_when: false - name: install aesm service - shell: "set -o pipefail && rpm --reinstall --nodeps '{{ sgx_rpm_directory }}/{{ item }}-{{ sgx_pkg_version }}-1.el8.x86_64.rpm'" + ansible.builtin.shell: + cmd: "set -o pipefail && rpm --reinstall --nodeps '{{ sgx_rpm_directory }}/{{ item }}-{{ sgx_pkg_version }}-1.el9.x86_64.rpm'" + executable: /bin/bash loop: - sgx-aesm-service when: "'sgx-aesm-service' not in aesm_package.stdout" @@ -109,7 +113,7 @@ state: directory mode: 0644 - - name: downloading protobuf from rocky 8 repository + - name: downloading protobuf from rocky repository get_url: url: "{{ protobuf_repository }}/{{ protobuf_version }}" dest: "{{ (project_root_dir, protobuf_dir, protobuf_version) | path_join }}" @@ -120,8 +124,9 @@ delay: "{{ retry_delay | default(3) }}" - name: unpack protobuf rpm - shell: 'rpm2cpio {{ protobuf_version }} | cpio -idmv' - args: + ansible.builtin.shell: + cmd: 'set -o pipefail && rpm2cpio {{ protobuf_version }} | cpio -idmv' + executable: /bin/bash chdir: "{{ (project_root_dir, protobuf_dir) | path_join }}" changed_when: true @@ -133,7 +138,7 @@ group: root mode: 0755 dest: "{{ ('/', protobuf_library_dir, protobuf_library_version_long) | path_join }}" - when: ansible_distribution_version >= '9.0' + when: ansible_distribution_version is version('9.0', '>=') - name: install common software for sgx platform package: @@ -165,7 +170,7 @@ - name: wait for aesmd service to start pause: # there is no smart way how to check if systemd service will stay running minutes: 1 - when: ansible_distribution_version >= '9.0' + when: ansible_distribution_version is version('9.0', '>=') - name: get aesmd service facts service_facts: diff --git a/roles/bootstrap/configure_sgx/tasks/ubuntu.yml b/roles/bootstrap/configure_sgx/tasks/ubuntu.yml index aec20d8c..5e46b33e 100644 --- a/roles/bootstrap/configure_sgx/tasks/ubuntu.yml +++ b/roles/bootstrap/configure_sgx/tasks/ubuntu.yml @@ -116,44 +116,3 @@ - debug: var: psw_confirm.stdout_lines when: '"Succeed" in psw_confirm.stdout' - -- name: prepare worker node with sgx enabled - block: - - name: ensure sgx_prv group exists - ansible.builtin.group: - name: sgx_prv - state: present - - - name: add user to sgx_prv group - ansible.builtin.user: - name: "{{ ansible_user_id }}" - groups: sgx_prv - append: yes - - - name: create udev rules - ansible.builtin.blockinfile: - path: /etc/udev/rules.d/10-sgx.rules - create: yes - mode: '0644' - block: | - SUBSYSTEM=="misc",KERNEL=="enclave",MODE="0666" - SUBSYSTEM=="misc",KERNEL=="provision",GROUP="sgx_prv",MODE="0660" - SUBSYSTEM=="sgx",KERNEL=="sgx/enclave",MODE="0666" - SUBSYSTEM=="sgx",KERNEL=="sgx/provision",MODE="0660" - SUBSYSTEM=="misc",KERNEL=="sgx_enclave",MODE="0666",SYMLINK+="sgx/enclave" - SUBSYSTEM=="misc",KERNEL=="sgx_provision",GROUP="sgx_prv",MODE="0660",SYMLINK+="sgx/provision" - - - name: copy configure-sgx-udev.service file - ansible.builtin.copy: - src: configure-sgx-udev.service - dest: /lib/systemd/system/configure-sgx-udev.service - mode: 0755 - - - name: ensure configure-sgx-udev.service 
started - ansible.builtin.systemd: - state: started - name: configure-sgx-udev - enabled: true - when: - - (ansible_distribution == "Ubuntu" and ansible_distribution_version >= '21.04') - or (ansible_os_family == "RedHat" and ansible_distribution_version >= '8.4') diff --git a/roles/bootstrap/configure_sst/tasks/main.yml b/roles/bootstrap/configure_sst/tasks/main.yml index 93eea337..13f33a97 100644 --- a/roles/bootstrap/configure_sst/tasks/main.yml +++ b/roles/bootstrap/configure_sst/tasks/main.yml @@ -26,4 +26,4 @@ include_tasks: clx_setup_sst_bf.yml # for now only sst bf is supported on CLX when: - is_clx and sst_bf_configuration_enabled - - ansible_os_family == "RedHat" and ansible_distribution_version >= "8.3" + - ansible_os_family == "RedHat" and ansible_distribution_version is version('8.3', '>=') diff --git a/roles/bootstrap/configure_sst/tasks/sst_bf_cp_tf_pp_setup.yml b/roles/bootstrap/configure_sst/tasks/sst_bf_cp_tf_pp_setup.yml index ea854b40..ba978089 100644 --- a/roles/bootstrap/configure_sst/tasks/sst_bf_cp_tf_pp_setup.yml +++ b/roles/bootstrap/configure_sst/tasks/sst_bf_cp_tf_pp_setup.yml @@ -16,7 +16,7 @@ --- - name: install Intel-Speed-Select-Technology (ISST) tool on Ubuntu include_tasks: ubuntu_install_sst_tool.yml - when: ansible_distribution == 'Ubuntu' and ansible_distribution_version >= '20.04' + when: ansible_distribution == 'Ubuntu' and ansible_distribution_version is version('20.04', '>=') - name: Intel(R)-Speed-Select-Technology (ISST) verification command: "intel-speed-select --info" @@ -34,6 +34,7 @@ - name: SST-BF verification command: "intel-speed-select base-freq enable -a" + changed_when: true register: sst_bf_verify when: - sst_bf_configuration_enabled is defined and sst_bf_configuration_enabled @@ -41,6 +42,7 @@ - name: SST-CP verification command: "intel-speed-select core-power enable -a" + changed_when: true register: sst_cp_verify when: - sst_cp_configuration_enabled is defined and sst_cp_configuration_enabled @@ -48,6 +50,7 @@ - name: SST-TF verification command: "intel-speed-select turbo-freq enable -a" + changed_when: true register: sst_tf_verify when: - sst_tf_configuration_enabled is defined and sst_tf_configuration_enabled @@ -97,7 +100,7 @@ when: - '"get-config-levels:0" in sst_pp_verify.stderr' - (ansible_distribution == "Ubuntu") or - (ansible_os_family == "RedHat" and ansible_distribution_version >= "8.3") + (ansible_os_family == "RedHat" and ansible_distribution_version is version('8.3', '>=')) # Intel(R) SST-PP (feature perf-profile) configuration - name: check if SST-PP dir exists @@ -111,8 +114,8 @@ - sst_pp_configuration_enabled is defined and sst_pp_configuration_enabled - not check_sst_pp_dir.stat.exists - '"Intel(R) SST-PP (feature perf-profile) is supported" in isst_verify.stderr_lines' - - (ansible_distribution == "Ubuntu" and ansible_distribution_version >= "20.04") or - (ansible_os_family == "RedHat" and ansible_distribution_version >= "8.3") + - (ansible_distribution == "Ubuntu" and ansible_distribution_version is version('20.04', '>=')) or + (ansible_os_family == "RedHat" and ansible_distribution_version is version('8.3', '>=')) - debug: msg: "Intel(R) SST-PP (perf-profile) is not supported on platform, SST-PP deployment skipped" diff --git a/roles/bootstrap/configure_sst/tasks/sst_pp.yml b/roles/bootstrap/configure_sst/tasks/sst_pp.yml index ebcaa684..a6ca57a6 100644 --- a/roles/bootstrap/configure_sst/tasks/sst_pp.yml +++ b/roles/bootstrap/configure_sst/tasks/sst_pp.yml @@ -141,9 +141,10 @@ - name: check available online CPUs
values for auto config shell: - cmd: "turbostat -c {{ online_cpus_range.stdout }} --show Package,Core,CPU,Bzy_MHz -i 1 | head -n 59" + cmd: "set -o pipefail && turbostat -c {{ online_cpus_range.stdout }} --show Package,Core,CPU,Bzy_MHz -i 1 | head -n 59" args: executable: /bin/bash + changed_when: false register: read_turbostat_output when: '"auto" in sst_tf_online_cpus' @@ -155,11 +156,15 @@ - '"auto" in sst_tf_online_cpus' - name: save turbostat output for auto config to SST-PP dir path - shell: - cmd: "turbostat -c {{ online_cpus_range.stdout }} --show Package,Core,CPU,Bzy_MHz -i 1 | head -n 59 > sst_pp_turbostat_output_when_auto.txt" + ansible.builtin.shell: + cmd: >- + set -o pipefail && + turbostat -c {{ online_cpus_range.stdout }} --show Package,Core,CPU,Bzy_MHz -i 1 + | head -n 59 > sst_pp_turbostat_output_when_auto.txt args: executable: /bin/bash chdir: "{{ project_root_dir }}/sst_pp_config" + changed_when: false when: - '"enable" in sst_tf_config' - '"auto" in sst_tf_online_cpus' @@ -174,11 +179,12 @@ # Config SST-TF turbo-freq when all are disabled - name: check available online CPUs values when SST-BF,SST-CP and SST-TF are disabled shell: - cmd: turbostat -c {{ online_cpus_range.stdout }} --show Package,Core,CPU,Bzy_MHz -i 1 | head -n 59 + cmd: set -o pipefail && turbostat -c {{ online_cpus_range.stdout }} --show Package,Core,CPU,Bzy_MHz -i 1 | head -n 59 args: executable: /bin/bash chdir: "{{ project_root_dir }}/sst_pp_config" register: turbostat_output_for_disabled + changed_when: false when: - '"disable" in sst_bf_config' - '"disable" in sst_cp_config' @@ -193,8 +199,11 @@ - '"disable" in sst_tf_config' - name: save turbostat output to SST-PP dir when SST-BF,SST-CP and SST-TF are disabled - shell: - cmd: "turbostat -c {{ online_cpus_range.stdout }} --show Package,Core,CPU,Bzy_MHz -i 1 | head -n 59 > sst_pp_turbostat_output_when_disabled.txt" + ansible.builtin.shell: + cmd: >- + set -o pipefail && + turbostat -c {{ online_cpus_range.stdout }} --show Package,Core,CPU,Bzy_MHz -i 1 + | head -n 59 > sst_pp_turbostat_output_when_disabled.txt args: executable: /bin/bash chdir: "{{ project_root_dir }}/sst_pp_config" diff --git a/roles/bootstrap/configure_sst/tasks/sst_pp_user_defined_setup.yml b/roles/bootstrap/configure_sst/tasks/sst_pp_user_defined_setup.yml index a53a3616..b8d000af 100644 --- a/roles/bootstrap/configure_sst/tasks/sst_pp_user_defined_setup.yml +++ b/roles/bootstrap/configure_sst/tasks/sst_pp_user_defined_setup.yml @@ -47,7 +47,7 @@ - name: set user defined online CPUs values for turbo-freq configuration shell: - cmd: "turbostat -c {{ sst_tf_online_cpus }} --show Package,Core,CPU,Bzy_MHz -i 1 | head -n 59" + cmd: "set -o pipefail && turbostat -c {{ sst_tf_online_cpus }} --show Package,Core,CPU,Bzy_MHz -i 1 | head -n 59" args: executable: /bin/bash register: read_turbostat_values @@ -64,8 +64,11 @@ mode: "u=rwx,g=rx,o=rx" - name: save turbostat output for user defined online CPUs to SST-PP dir path - shell: - cmd: "turbostat -c {{ sst_tf_online_cpus }} --show Package,Core,CPU,Bzy_MHz -i 1 2>&1 | head -n 59 > sst_pp_user_defined_turbostat_output.txt" + ansible.builtin.shell: + cmd: >- + set -o pipefail && + turbostat -c {{ sst_tf_online_cpus }} --show Package,Core,CPU,Bzy_MHz -i 1 2>&1 + | head -n 59 > sst_pp_user_defined_turbostat_output.txt args: executable: /bin/bash chdir: "{{ project_root_dir }}/sst_pp_config" diff --git a/roles/bootstrap/golang_install/defaults/main.yml b/roles/bootstrap/golang_install/defaults/main.yml index 2d27d17b..70f7913a 100644 --- 
a/roles/bootstrap/golang_install/defaults/main.yml +++ b/roles/bootstrap/golang_install/defaults/main.yml @@ -14,8 +14,8 @@ ## limitations under the License. ## --- -golang_version: "1.21.1" -golang_download_checksum: "sha256:b3075ae1ce5dab85f89bc7905d1632de23ca196bd8336afd93fa97434cfa55ae" +golang_version: "1.21.4" +golang_download_checksum: "sha256:73cac0215254d0c7d1241fa40837851f3b9a8a742d0b54714cbdfb3feaf8f0af" golang_download_url: >- {{ 'https://mirrors.aliyun.com/golang/go' + golang_version + '.linux-amd64.tar.gz' diff --git a/roles/bootstrap/golang_install/tasks/main.yml b/roles/bootstrap/golang_install/tasks/main.yml index ed8360ab..d14137cc 100644 --- a/roles/bootstrap/golang_install/tasks/main.yml +++ b/roles/bootstrap/golang_install/tasks/main.yml @@ -102,30 +102,17 @@ changed_when: false when: (additional_go_version | default('')) | length == 0 -- name: start procedure to install cfssl and cfssljson in required versions +- name: Start procedure to install cfssl and cfssljson block: - - name: check current cfssl version - shell: go version -m $(which cfssl) | grep mod | awk '{print $3}' - changed_when: false - failed_when: false - register: cfssl_current_version + - name: Install cfssl to latest version + ansible.builtin.command: go install github.com/cloudflare/cfssl/cmd/cfssl@latest + register: go_command + changed_when: go_command.stdout | length > 0 - - name: check latest cfssl version - shell: go list -m -versions github.com/cloudflare/cfssl | awk '{print $7}' # $7 should be latest varsion - changed_when: false - failed_when: false - register: cfssl_latest_version - - - name: install cfssl in latest version - command: go install github.com/cloudflare/cfssl/cmd/cfssl@latest - changed_when: true - when: cfssl_current_version.stdout != cfssl_latest_version.stdout - - # NOTE(pklimowx): cfssljson doesn't return useful version information - # we have to lose 1s here - - name: install cfssljson in latest version - command: go install github.com/cloudflare/cfssl/cmd/cfssljson@latest - changed_when: true + - name: Install cfssljson to latest version + ansible.builtin.command: go install github.com/cloudflare/cfssl/cmd/cfssljson@latest + register: go_command + changed_when: go_command.stdout | length > 0 when: - groups['kube_control_plane'] | length > 0 - inventory_hostname == groups['kube_control_plane'][0] diff --git a/roles/bootstrap/install_npu_driver/tasks/main.yml b/roles/bootstrap/install_npu_driver/tasks/main.yml index 3183e196..43237ded 100644 --- a/roles/bootstrap/install_npu_driver/tasks/main.yml +++ b/roles/bootstrap/install_npu_driver/tasks/main.yml @@ -36,13 +36,16 @@ patterns: "vpu-linux-drivers-ubuntu2204-release*.tar.gz" register: npu_driver_packages -- name: Create NPU driver unarchive path +- name: Clean old NPU driver + include_tasks: cleanup.yml + +- name: Create new NPU driver unarchive path ansible.builtin.file: path: "{{ src_npu_driver_path }}" state: directory mode: '0755' -- name: Unarchive NPU drivers package +- name: Unarchive new NPU driver package ansible.builtin.unarchive: src: "{{ npu_driver_packages.files[0].path }}" dest: "{{ src_npu_driver_path }}" diff --git a/roles/bootstrap/install_packages/tasks/debian.yml b/roles/bootstrap/install_packages/tasks/debian.yml index dc8d3625..d8509eef 100644 --- a/roles/bootstrap/install_packages/tasks/debian.yml +++ b/roles/bootstrap/install_packages/tasks/debian.yml @@ -33,7 +33,7 @@ when: - configure_tdx | default(false) - on_vms | default(false) - - ansible_distribution == "Ubuntu" and 
ansible_distribution_version == '22.04' + - ansible_distribution == "Ubuntu" and ansible_distribution_version is version('22.04', '==') - name: disable automatic package updates apt: @@ -63,6 +63,16 @@ mode: 0644 when: ansible_os_family == "Debian" +- name: wait for apt-get exit + ansible.builtin.shell: "set -o pipefail && echo -n $(ps -A | grep apt-get | awk '{print $1}')" # noqa command-instead-of-shell + args: + executable: /bin/bash + register: apt_get_out + delay: 10 + retries: 10 + until: apt_get_out.stdout | length == 0 + changed_when: false + - name: set ubuntu APT Source to tsinghua source set_fact: ubuntu_apt_source: "https://mirrors.aliyun.com/ubuntu/" @@ -117,7 +127,7 @@ filename: devel-kubic-libcontainers-stable when: - '"docker" not in container_runtime' - - ansible_distribution == "Ubuntu" and ansible_distribution_version == '20.04' + - ansible_distribution == "Ubuntu" and ansible_distribution_version is version('20.04', '==') - name: apt update apt: update_cache=yes @@ -202,7 +212,7 @@ when: - configure_tdx | default(false) - on_vms | default(false) - - ansible_distribution == "Ubuntu" and ansible_distribution_version == '22.04' + - ansible_distribution == "Ubuntu" and ansible_distribution_version is version('22.04', '==') # hirsute (21.04) package for (image & headers) is 20.04. (Note: ansible_distribution_version will not be returned as the correct version) # Depending on the needs, we can split tasks for future Ubuntu releases if necessary. diff --git a/roles/bootstrap/install_packages/tasks/main.yml b/roles/bootstrap/install_packages/tasks/main.yml index 62fcf10d..4ff3eb78 100644 --- a/roles/bootstrap/install_packages/tasks/main.yml +++ b/roles/bootstrap/install_packages/tasks/main.yml @@ -37,8 +37,6 @@ - jmespath==0.9.5 - ruamel.yaml==0.16.13 - cachetools==4.2.1 - - markupsafe==1.1.1 - - jinja2==2.11.3 - openshift==0.12.1 - kubernetes==12.0.1 - six>=1.15.0 @@ -50,3 +48,11 @@ retries: 5 until: pip_result is succeeded delay: 5 + +- name: install jinja2 on non-localhost machines + pip: + name: + - jinja2==2.11.3 + - markupsafe==1.1.1 + state: present + when: inventory_hostname != 'localhost' diff --git a/roles/bootstrap/install_packages/tasks/rhel.yml b/roles/bootstrap/install_packages/tasks/rhel.yml index e623d997..80a5f278 100644 --- a/roles/bootstrap/install_packages/tasks/rhel.yml +++ b/roles/bootstrap/install_packages/tasks/rhel.yml @@ -17,7 +17,8 @@ - name: verify system subscription status on RHEL command: "subscription-manager list --available --all" register: check_subscription_status - failed_when: false + failed_when: false # TODO remove or set proper condition + changed_when: false when: ansible_distribution == 'RedHat' - debug: @@ -29,6 +30,7 @@ - name: enable powertools repository on Rocky < 9.0 # noqa command-instead-of-module - yum is called intentionally here command: yum config-manager --set-enabled powertools + changed_when: true when: ansible_distribution == 'Rocky' and ansible_distribution_version < '9.0' - name: enable CodeReady Linux Builder repository on RHEL 8 @@ -43,19 +45,20 @@ rhsm_repository: name: codeready-builder-for-rhel-9-x86_64-rpms when: - - ansible_distribution == "RedHat" and ansible_distribution_version >= "9.0" + - ansible_distribution == "RedHat" and ansible_distribution_version is version('9.0', '>=') - "'This system is not yet registered' not in check_subscription_status.stderr" failed_when: false # allow to fail if o/s is not subscribed but need to warn user # Rocky 9.0 --set-enabled crb is required which is similar to
--set-enabled powertools on Rocky <= 9.0 - name: enable CRB to support dependent on packages from repositories command: "dnf config-manager --set-enabled crb -y" - when: ansible_distribution == "Rocky" and ansible_distribution_version >= "9.0" + changed_when: true + when: ansible_distribution == "Rocky" and ansible_distribution_version is version('9.0', '>=') - name: install epel-release on Rocky >= 9.0 package: name: epel-release - when: ansible_distribution == "Rocky" and ansible_distribution_version >= "9.0" + when: ansible_distribution == "Rocky" and ansible_distribution_version is version('9.0', '>=') - name: obtain RPM-GPG-KEY-EPEL-8 rpm_key: @@ -63,7 +66,7 @@ key: "{{ fedora_epel_repo_url }}/RPM-GPG-KEY-EPEL-8" when: - ansible_distribution in ['RedHat', 'Rocky'] - - ansible_distribution_version >= '8' + - ansible_distribution_version is version('8', '>=') - ansible_distribution_version < '9' - name: install RPM-GPG-KEY-EPEL-8 @@ -71,7 +74,7 @@ name: "{{ fedora_epel_repo_url }}/epel-release-latest-8.noarch.rpm" when: - ansible_distribution in ['RedHat', 'Rocky'] - - ansible_distribution_version >= '8' + - ansible_distribution_version is version('8', '>=') - ansible_distribution_version < '9' - name: obtain RPM-GPG-KEY-EPEL-9 @@ -80,14 +83,14 @@ key: "{{ fedora_epel_repo_url }}/RPM-GPG-KEY-EPEL-9" when: - ansible_distribution in ["RedHat", "Rocky"] - - ansible_distribution_version >= "9" + - ansible_distribution_version is version('9', '>=') - name: install RPM-GPG-KEY-EPEL-9 package: name: "{{ fedora_epel_repo_url }}/epel-release-latest-9.noarch.rpm" when: - ansible_distribution in ["RedHat", "Rocky"] - - ansible_distribution_version >= "9" + - ansible_distribution_version is version('9', '>=') # CPUID package is missing on RHEL 9.0 / Rocky 9.0 - name: block for downloading CPUID on RHEL / Rocky >= 9.0 @@ -116,7 +119,7 @@ state: present when: - ansible_distribution in ['RedHat', 'Rocky'] - - ansible_distribution_version >= '9' + - ansible_distribution_version is version('9', '>=') - name: get current kernel version command: uname -r @@ -136,7 +139,7 @@ - "https://dl.rockylinux.org/vault/rocky/8.5/BaseOS/x86_64/kickstart/Packages/k/kernel-devel-4.18.0-348.el8.0.2.x86_64.rpm" when: - ansible_distribution == "Rocky" - - ansible_distribution_version == '8.5' + - ansible_distribution_version is version('8.5', '==') - not update_kernel - name: get Rocky 9 packages for base o/s @@ -171,7 +174,7 @@ - "https://dl.rockylinux.org/{{ rocky_uri }}/rocky/{{ ansible_distribution_version }}/AppStream/x86_64/kickstart/Packages/k/kernel-devel-{{ current_kernel_version.stdout }}.rpm" # noqa yaml[line-length] when: - ansible_distribution == "Rocky" - - ansible_distribution_version >= '9.0' + - ansible_distribution_version is version('9.0', '>=') - not update_kernel - not on_vms | default (false) @@ -188,13 +191,13 @@ - "https://dl.rockylinux.org/pub/rocky/9/AppStream/x86_64/kickstart/Packages/k/kernel-devel-{{ current_kernel_version.stdout }}.rpm" when: - ansible_distribution == "Rocky" - - ansible_distribution_version >= '9.0' + - ansible_distribution_version is version('9.0', '>=') - on_vms | default (false) - name: install epel-next-release on Rocky >= 9.0 package: name: epel-next-release - when: ansible_distribution == "Rocky" and ansible_distribution_version >= "9.0" + when: ansible_distribution == "Rocky" and ansible_distribution_version is version('9.0', '>=') - name: install epel-release on Amazon Linux 2 package: @@ -206,7 +209,7 @@ until: source_status is not failed when: - 
ansible_distribution == "Amazon" - - ansible_distribution_version == '2' + - ansible_distribution_version is version('2', '==') # pull the matching kernel headers if kernel is not updated - name: pull matching kernel headers from configured repos @@ -238,14 +241,14 @@ name: python3-pip when: - ansible_os_family == "RedHat" - - ansible_distribution_version >= '8' + - ansible_distribution_version is version('8', '>=') - name: install python3-devel package: name: python3-devel when: - ansible_os_family == "RedHat" - - ansible_distribution_version >= '8' + - ansible_distribution_version is version('8', '>=') # no harm to remove package it will be reinstalled / updated during dnf update - name: remove network-scripts package when update packages is required in Rocky / RHEL >= 9.0 @@ -253,7 +256,7 @@ name: network-scripts state: absent when: - - ansible_os_family == "RedHat" and ansible_distribution_version >= "9.0" + - ansible_os_family == "RedHat" and ansible_distribution_version is version('9.0', '>=') - update_all_packages | default(false) - name: update all packages @@ -284,7 +287,7 @@ link: /usr/bin/python when: - ansible_os_family == "RedHat" - - ansible_distribution_version >= '8' + - ansible_distribution_version is version('8', '>=') - name: install command line tools to collect hardware details package: @@ -292,6 +295,7 @@ - inxi - jq - lshw + - vim state: present when: ansible_os_family == "RedHat" @@ -308,5 +312,5 @@ - chrony - iputils when: - - ansible_distribution in ['RedHat', 'Rocky'] and ansible_distribution_version >= '8.4' + - ansible_distribution in ['RedHat', 'Rocky'] and ansible_distribution_version is version('8.4', '>=') - container_runtime == "docker" diff --git a/roles/bootstrap/install_qat_drivers_services/tasks/main.yml b/roles/bootstrap/install_qat_drivers_services/tasks/main.yml index 231f71fd..de55a08f 100644 --- a/roles/bootstrap/install_qat_drivers_services/tasks/main.yml +++ b/roles/bootstrap/install_qat_drivers_services/tasks/main.yml @@ -34,6 +34,26 @@ - on_vms | default(false) - hostvars[hostvars[inventory_hostname]['vm_host']]['qat_oot_driver_build_failed'] | default(false) +- name: WA for QAT OOT driver issue on RHEL/Rocky + when: + - on_vms | default(false) + - hostvars[hostvars[inventory_hostname]['vm_host']]['ansible_os_family'] == "RedHat" + - configured_arch in ['gnr'] + block: + - name: try to use intree qat driver on kube_control_plane + ansible.builtin.set_fact: + qat_oot_driver_build_failed: true + update_qat_drivers: false + with_items: "{{ groups['kube_control_plane'] }}" + delegate_to: "{{ item }}" + delegate_facts: true + run_once: true + + - name: try to use intree qat driver + ansible.builtin.set_fact: + qat_oot_driver_build_failed: true + update_qat_drivers: false + - name: install QAT OOT driver ansible.builtin.include_tasks: qat_oot_driver_install.yml when: diff --git a/roles/bootstrap/install_qat_drivers_services/tasks/qat_oot_driver_install.yml b/roles/bootstrap/install_qat_drivers_services/tasks/qat_oot_driver_install.yml index 4c13284a..fdb1b20b 100644 --- a/roles/bootstrap/install_qat_drivers_services/tasks/qat_oot_driver_install.yml +++ b/roles/bootstrap/install_qat_drivers_services/tasks/qat_oot_driver_install.yml @@ -15,82 +15,90 @@ ## --- - name: install dependencies for QAT - include_role: + ansible.builtin.include_role: name: install_dependencies - name: install libudev-dev package on Ubuntu - apt: + ansible.builtin.apt: name: libudev-dev when: ansible_distribution == "Ubuntu" - name: create directory {{ qat_drivers_dir }} for 
- name: create directory {{ qat_drivers_dir }} for all QAT dependencies - file: + ansible.builtin.file: path: "{{ qat_drivers_dir }}" state: directory - mode: "u=rwx,g=rx,o=rx" + mode: '0755' - name: block for QAT 1.x block: - name: download QAT drivers package {{ qat_drivers_version }} - get_url: + ansible.builtin.get_url: url: "{{ qat_drivers_download_url }}" checksum: "{{ qat_drivers_pkg_checksum }}" dest: "{{ qat_drivers_dir }}" - mode: 0755 + mode: '0755' register: qat_driver_sw until: qat_driver_sw is not failed retries: 5 - name: unarchive QAT drivers package - unarchive: + ansible.builtin.unarchive: src: "{{ qat_drivers_dir }}/{{ qat_drivers_version }}.tar.gz" dest: "{{ qat_drivers_dir }}" remote_src: yes - mode: 0755 + mode: '0755' when: - - configured_arch not in ["spr", "emr"] + - configured_arch not in ["spr", "emr", "gnr"] - name: block for QAT 2.x block: - name: download QAT drivers package {{ qat_spr_drivers_version }} - get_url: + ansible.builtin.get_url: url: "{{ qat_spr_drivers_download_url }}" checksum: "{{ qat_spr_drivers_pkg_checksum }}" dest: "{{ qat_drivers_dir }}" - mode: 0755 + mode: '0755' register: qat_driver_sw until: qat_driver_sw is not failed retries: 5 - name: unarchive QAT drivers package - unarchive: + ansible.builtin.unarchive: src: "{{ qat_drivers_dir }}/{{ qat_spr_drivers_version }}.tar.gz" dest: "{{ qat_drivers_dir }}" remote_src: yes - mode: 0755 + mode: '0755' when: - configured_arch in ["spr"] -# Due to EMR is not lauched yet, EMR QAT driver temporally copy from ansible host -# When external driver offically support the EMR platform, converge w/ upper task -- name: block for EMR QAT driver package +# When a new platform is not yet launched, the NDA QAT driver is temporarily copied from the Ansible host +# When an external driver officially supports the new platform, the platform is moved to the upper task +- name: block for NDA QAT driver package block: - - name: copy EMR QAT driver package + - name: switch NDA QAT driver package for Rocky 9.2 + ansible.builtin.set_fact: + nda_qat_driver_package: "{{ nda_qat_driver_package_rocky }}" + nda_qat_driver_pkg_checksum: "{{ nda_qat_driver_pkg_checksum_rocky }}" + when: + - ansible_distribution == "Rocky" and ansible_distribution_version is version('9.2', '==') + - configured_arch in ['gnr'] + + - name: copy NDA QAT driver package ansible.builtin.copy: - src: "{{ (emr_qat_driver_staging_folder, emr_qat_driver_package) | path_join }}" - dest: "{{ (qat_drivers_dir, emr_qat_driver_package) | path_join }}" - mode: 0644 - - name: unarchive EMR QAT driver package + src: "{{ (nda_qat_driver_folder, nda_qat_driver_package) | path_join }}" + dest: "{{ (qat_drivers_dir, nda_qat_driver_package) | path_join }}" + mode: '0644' + - name: unarchive NDA QAT driver package ansible.builtin.unarchive: - src: "{{ (qat_drivers_dir, emr_qat_driver_package) | path_join }}" + src: "{{ (qat_drivers_dir, nda_qat_driver_package) | path_join }}" dest: "{{ qat_drivers_dir }}" remote_src: yes - mode: 0755 + mode: '0755' when: - - configured_arch == "emr" + - configured_arch in ['emr', 'gnr'] - name: check all packages are present for QAT drivers installation - command: ./configure + ansible.builtin.command: ./configure args: chdir: "{{ qat_drivers_dir }}" register: qat_requirements @@ -98,7 +106,7 @@ changed_when: true - name: playbook terminated, packages for QAT drivers installation are missing - fail: + ansible.builtin.fail: msg: - "Missing requirements for QAT drivers (i.e. 
kernel sources)" - "If failure persists, consider setting update_kernel: true in group_vars" @@ -119,12 +127,12 @@ line: "***/" when: - on_vms | default(false) - - configured_arch not in ["spr", "emr"] + - configured_arch not in ["spr", "emr", "gnr"] - name: block for QAT 1.x drivers and samples compilation block: - name: make install QAT drivers - command: "make -j install" + ansible.builtin.command: "make -j install" args: chdir: "{{ qat_drivers_dir }}" become: yes @@ -133,15 +141,15 @@ - reboot server - name: make performance sample application after QAT drivers - make: + community.general.make: chdir: "{{ qat_drivers_dir }}" target: samples-install become: yes when: - - configured_arch not in ["spr", "emr"] + - configured_arch not in ["spr", "emr", "gnr"] - name: set QAT OOT 2.x driver build status - set_fact: + ansible.builtin.set_fact: qat_oot_driver_build_failed: false when: not qat_oot_driver_build_failed | default(false) @@ -149,32 +157,48 @@ - name: block for QAT 2.x drivers and samples compilation block: - name: make install QAT drivers - command: "make -j install" + ansible.builtin.command: "make -j install" args: chdir: "{{ qat_drivers_dir }}" become: yes changed_when: true - name: make performance sample application after QAT drivers - make: + community.general.make: chdir: "{{ qat_drivers_dir }}" target: samples-install become: yes + + - name: create WA to fix QAT driver issue after reboot + block: + - name: add ExecStartPre commands to qat.service + ansible.builtin.lineinfile: + path: "/usr/lib/systemd/system/qat.service" + line: "{{ item }}" + regexp: "^{{ item }}" + insertbefore: "^ExecStart=.*" + with_items: + - "ExecStartPre=sleep 20" + - "ExecStartPre=/etc/init.d/qat_service shutdown" + - name: systemd daemon_reload to take qat.service changes + ansible.builtin.systemd: + daemon_reload: yes + become: yes rescue: - name: QAT 2.x OOT driver build error - debug: + ansible.builtin.debug: msg: "QAT 2.x OOT driver build or installation failed. 
Rolling back to use inbox driver - functionality might be limited" - name: set QAT OOT 2.x driver build status - set_fact: + ansible.builtin.set_fact: qat_oot_driver_build_failed: true update_qat_drivers: false when: - - configured_arch in ["spr", "emr"] + - configured_arch in ["spr", "emr", "gnr"] - not qat_oot_driver_build_failed | default(false) - name: confirm QAT module installed - shell: "set -o pipefail && lsmod | grep qat" + ansible.builtin.shell: "set -o pipefail && lsmod | grep qat" args: executable: /bin/bash register: qat_confirm @@ -182,7 +206,7 @@ changed_when: false - name: enable SRIOV QAT devices on VMs - lineinfile: + ansible.builtin.lineinfile: path: "/etc/default/qat" line: "SRIOV_ENABLE=1" regexp: "^#SRIOV_ENABLE=1" @@ -194,13 +218,13 @@ - on_vms is defined and on_vms - name: make sure {{ disabled_qat_service }} service is stopped and disabled - service: + ansible.builtin.service: state: stopped name: "{{ disabled_qat_service }}" enabled: no - name: make sure {{ enabled_qat_service }} service is started and enabled - service: + ansible.builtin.service: state: started name: "{{ enabled_qat_service }}" enabled: yes @@ -209,15 +233,15 @@ - name: configuration for QAT Shared Virtual Memory (SVM) block: - name: set QAT SVM is enabled - set_fact: + ansible.builtin.set_fact: svm_value: 1 - name: enable address translation services for QAT Shared Virtual Memory (SVM) - replace: + ansible.builtin.replace: path: "{{ item }}" regexp: '(^SVMEnabled\s)(.*)$' replace: 'SVMEnabled = {{ svm_value }}' - mode: 0600 + mode: '0600' with_items: - "{{ qat_drivers_dir }}/quickassist/utilities/adf_ctl/conf_files/4xxxvf_dev0.conf.vm" - "{{ qat_drivers_dir }}/quickassist/utilities/adf_ctl/conf_files/4xxxvf_dev0.conf.sym.vm" @@ -225,5 +249,5 @@ - "{{ qat_drivers_dir }}/quickassist/utilities/adf_ctl/conf_files/4xxxvf_dev0.conf.asym.vm" - "{{ qat_drivers_dir }}/quickassist/utilities/adf_ctl/conf_files/4xxxvf_dev0.conf.dc.sym.vm" when: - - configured_arch in ["spr", "emr"] + - configured_arch in ["spr", "emr", "gnr"] - enable_qat_svm | default(false) diff --git a/roles/bootstrap/install_qat_drivers_services/vars/main.yml b/roles/bootstrap/install_qat_drivers_services/vars/main.yml index f306c3a1..b8280460 100644 --- a/roles/bootstrap/install_qat_drivers_services/vars/main.yml +++ b/roles/bootstrap/install_qat_drivers_services/vars/main.yml @@ -22,6 +22,7 @@ install_dependencies: - wget - make - yasm + - nasm - libboost-all-dev - libnl-genl-3-dev - zlib1g @@ -41,5 +42,6 @@ install_dependencies: - perl - usbutils - yasm + - nasm - boost-devel - libnl3-devel diff --git a/roles/bootstrap/install_qatlibs/defaults/main.yml b/roles/bootstrap/install_qatlibs/defaults/main.yml index ab3ed57e..ccdc55f0 100644 --- a/roles/bootstrap/install_qatlibs/defaults/main.yml +++ b/roles/bootstrap/install_qatlibs/defaults/main.yml @@ -16,7 +16,7 @@ --- # QATLibs intel_qatlib_download_url: "https://github.com/intel/qatlib.git" -intel_qatlib_download_url_version: "23.08.0" +intel_qatlib_download_url_version: "23.11.0" intel_qatlib_download_url_dir: "{{ (project_root_dir, 'intel_qatlibs') | path_join }}" intel_qat_4xxx_firmware_download_url: https://git.kernel.org/pub/scm/linux/kernel/git/firmware/linux-firmware.git/plain/qat_4xxx.bin diff --git a/roles/bootstrap/install_qatlibs/tasks/main.yml b/roles/bootstrap/install_qatlibs/tasks/main.yml index bdcb7e40..2b844a62 100644 --- a/roles/bootstrap/install_qatlibs/tasks/main.yml +++ b/roles/bootstrap/install_qatlibs/tasks/main.yml @@ -108,7 +108,7 @@ # using shell module 
instead of command as it was giving 'aclocal: warning', causing playbook failure - name: run autogen before configure QATLibs - shell: './autogen.sh' # noqa 305 # command-instead-of-shell + ansible.builtin.shell: './autogen.sh' # noqa command-instead-of-shell args: chdir: "{{ intel_qatlib_download_url_dir }}" executable: /bin/bash @@ -130,6 +130,6 @@ command: "ldconfig" changed_when: true when: - - configured_arch in ["spr", "emr"] + - configured_arch in ["spr", "emr", "gnr"] - configure_qat | default(false) | bool - not update_qat_drivers | default(false) | bool diff --git a/roles/bootstrap/install_tdx_drivers/defaults/main.yml b/roles/bootstrap/install_tdx_drivers/defaults/main.yml index 8d95de22..370211b6 100644 --- a/roles/bootstrap/install_tdx_drivers/defaults/main.yml +++ b/roles/bootstrap/install_tdx_drivers/defaults/main.yml @@ -21,5 +21,5 @@ intel_tdx_packages_dir: "{{ (project_root_dir, 'intel-tdx') | path_join }}" tdx_1_0_kernel_version: 5.19.17-mvp29v4+4 tdx_1_0_tag: "2023ww33" -tdx_1_5_kernel_version: 6.2.16-mvp30v3+7 -tdx_1_5_tag: "2023ww27" +tdx_1_5_kernel_version: 6.2.16-v5.0 +tdx_1_5_tag: "2023ww41" diff --git a/roles/bootstrap/install_tdx_drivers/files/rust_setup.sh b/roles/bootstrap/install_tdx_drivers/files/rust_setup.sh new file mode 100644 index 00000000..56a7e5aa --- /dev/null +++ b/roles/bootstrap/install_tdx_drivers/files/rust_setup.sh @@ -0,0 +1,9 @@ +#!/bin/sh + +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rustup-init.sh +chmod a+x rustup-init.sh +./rustup-init.sh -y --profile minimal --default-toolchain nightly-2023-08-28 + +export PATH="${PATH}":"${HOME}"/.cargo/bin +cargo install cargo-xbuild +rustup component add rust-src
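rust_setup.sh pins an exact nightly toolchain, which the TDX build expects. A hedged sketch (hypothetical task names, not part of the patch) of how a role could verify the pin took effect before compiling:

- name: check active rust toolchain            # sketch only
  ansible.builtin.command: "{{ ansible_env.HOME }}/.cargo/bin/rustup show active-toolchain"
  register: rust_toolchain
  changed_when: false

- name: assert the pinned nightly is active    # sketch only
  ansible.builtin.assert:
    that: "'nightly-2023-08-28' in rust_toolchain.stdout"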
"{{ (intel_tdx_download_dir, 'build', 'ubuntu-22.04', 'intel-mvp-vtpm-td', 'build.sh') | path_join }}" - insertafter: " source sh_script/pre-build.sh" - line: " cp ../../intel-mvp-tdx-migration/migtd/Cargo.lock ./deps/td-shim/" - when: - - tdx_version == "1.5" - - - name: cleanup the already built libraries and packages - ansible.builtin.command: >- - ./build-repo.sh cleanup + - name: copy rust setup script to target machine + ansible.builtin.copy: + src: rust_setup.sh + dest: "{{ (intel_tdx_download_dir, 'rust_setup.sh') | path_join }}" + mode: '0755' + + - name: run the rust setup script + ansible.builtin.command: ./rust_setup.sh args: - chdir: "{{ (intel_tdx_download_dir, 'build', 'ubuntu-22.04') | path_join }}" + chdir: "{{ intel_tdx_download_dir }}" changed_when: true + become: true + register: rust_setup_register + + - name: print the rust installation log + ansible.builtin.debug: + msg: "{{ rust_setup_register.stdout }}" + + - name: print the prompt information + ansible.builtin.debug: + msg: "start to compile the tdx driver, it will cost a long time(~1h), please wait" - name: compile the tdx source code ansible.builtin.command: >- @@ -75,22 +75,23 @@ args: chdir: "{{ (intel_tdx_download_dir, 'build', 'ubuntu-22.04') | path_join }}" changed_when: true + become: true + environment: + PATH: "{{ ansible_env.HOME }}/.cargo/bin:{{ ansible_env.PATH }}" -- name: copy the comiled host/guest packages to {{ intel_tdx_packages_dir }} - block: - - name: copy host packages - ansible.builtin.copy: - src: "{{ (intel_tdx_download_dir, 'build', 'ubuntu-22.04', 'host_repo') | path_join }}" - dest: "{{ intel_tdx_packages_dir }}" - remote_src: yes - mode: '0644' + - name: generate copy_packages.sh script on target machine + ansible.builtin.template: + src: copy_packages.sh.j2 + dest: "{{ (intel_tdx_download_dir, 'build', 'ubuntu-22.04', 'copy_packages.sh') | path_join }}" + mode: '0755' - - name: sync guest packages - ansible.builtin.copy: - src: "{{ (intel_tdx_download_dir, 'build', 'ubuntu-22.04', 'guest_repo') | path_join }}" - dest: "{{ intel_tdx_packages_dir }}" - remote_src: yes - mode: '0644' + - name: copy + ansible.builtin.command: >- + ./copy_packages.sh + args: + chdir: "{{ (intel_tdx_download_dir, 'build', 'ubuntu-22.04') | path_join }}" + changed_when: true + become: true - name: install the host packages # npm do not use the apt module due to it can not resolve the dependencies for multiple packages diff --git a/roles/bootstrap/install_tdx_drivers/templates/copy_packages.sh.j2 b/roles/bootstrap/install_tdx_drivers/templates/copy_packages.sh.j2 new file mode 100644 index 00000000..93e99b35 --- /dev/null +++ b/roles/bootstrap/install_tdx_drivers/templates/copy_packages.sh.j2 @@ -0,0 +1,29 @@ +#! 
/bin/bash + +GUEST_DEFAULT_PKG=" \ +linux-image-unsigned-6.2.16-* linux-headers-6.2.16-* linux-modules-6.2.16-* \ +" +HOST_DEFAULT_PKG=" \ +linux-image-unsigned-6.2.16-*.deb linux-headers-6.2.16-* linux-modules-6.2.16-* linux-modules-extra-6.2.16-* \ +qemu-system-x86_7.2*.deb qemu-system-common_7.2*.deb qemu-system-data_7.2*.deb \ +ovmf_*_all.deb \ +libvirt-clients_*.deb libvirt0_*.deb libvirt-daemon_*.deb libvirt-daemon-system_*.deb libvirt-daemon-system-systemd_*.deb\ + libvirt-daemon-driver-qemu_*.deb libvirt-daemon-config-network_*.deb libvirt-daemon-config-nwfilter_*.deb\ + libvirt-login-shell_*.deb libvirt-daemon-driver-lxc_*.deb libvirt-dev_*.deb \ +mig-td_*_amd64.deb \ +vtpm-td_*_amd64.deb \ +" +# copy packages +cp host_repo/jammy/all/*.deb host_repo/jammy/amd64/ +cp guest_repo/jammy/all/*.deb guest_repo/jammy/amd64/ + +mkdir {{ intel_tdx_packages_dir }}/host_repo +pushd host_repo/jammy/amd64/ +cp $HOST_DEFAULT_PKG {{ intel_tdx_packages_dir }}/host_repo +popd + +mkdir {{ intel_tdx_packages_dir }}/guest_repo +pushd guest_repo/jammy/amd64/ +cp $GUEST_DEFAULT_PKG {{ intel_tdx_packages_dir }}/guest_repo +popd + diff --git a/roles/bootstrap/install_tdx_drivers/vars/main.yml b/roles/bootstrap/install_tdx_drivers/vars/main.yml index a420777f..4142a6eb 100644 --- a/roles/bootstrap/install_tdx_drivers/vars/main.yml +++ b/roles/bootstrap/install_tdx_drivers/vars/main.yml @@ -33,3 +33,9 @@ install_dependencies: - cpio - rpm2cpio - python3-dev + - nasm + - llvm + - clang + - ocaml + - ocamlbuild + - mini-dinstall diff --git a/roles/bootstrap/set_calico_vpp_interface_name/tasks/main.yml b/roles/bootstrap/set_calico_vpp_interface_name/tasks/main.yml new file mode 100644 index 00000000..a6b0e0ba --- /dev/null +++ b/roles/bootstrap/set_calico_vpp_interface_name/tasks/main.yml @@ -0,0 +1,50 @@ +## +## Copyright (c) 2020-2023 Intel Corporation. +## +## Licensed under the Apache License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. +## You may obtain a copy of the License at +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +## See the License for the specific language governing permissions and +## limitations under the License. 
+## +--- +- name: create default network configuration file for Rocky and RHEL + block: + - name: create /etc/systemd/network directory + ansible.builtin.file: + path: "/etc/systemd/network" + state: directory + mode: '0755' + - name: create network configuration file + ansible.builtin.template: + src: "70-custom-calicovpp.link.j2" + dest: "/etc/systemd/network/70-custom-calicovpp.link" + mode: '0744' + notify: + - reboot server + when: + - ansible_distribution in ['RedHat', 'Rocky'] and ansible_distribution_version is version('9.2', '>=') + +- name: create netplan configuration file for calicovpp on Ubuntu + block: + - name: create netplan configuration file + ansible.builtin.template: + src: "10-calico-vpp.yaml.j2" + dest: "/etc/netplan/10-calico-vpp.yaml" + mode: '0744' + - name: apply netplan for calicovpp + ansible.builtin.command: sudo netplan apply + changed_when: true + - name: wait 5 seconds for netplan apply + pause: + seconds: 5 + notify: + - reboot server + when: + - ansible_distribution == 'Ubuntu' diff --git a/roles/bootstrap/set_calico_vpp_interface_name/templates/10-calico-vpp.yaml.j2 b/roles/bootstrap/set_calico_vpp_interface_name/templates/10-calico-vpp.yaml.j2 new file mode 100644 index 00000000..6407756e --- /dev/null +++ b/roles/bootstrap/set_calico_vpp_interface_name/templates/10-calico-vpp.yaml.j2 @@ -0,0 +1,9 @@ +network: + ethernets: + {{ calico_vpp_interface_name }}: + dhcp4: true + match: + macaddress: {{ hostvars[inventory_hostname]['ansible_' + calico_vpp_interface_name].macaddress }} + optional: true + set-name: {{ calico_vpp.interface_name }} + version: 2 diff --git a/roles/bootstrap/set_calico_vpp_interface_name/templates/70-custom-calicovpp.link.j2 b/roles/bootstrap/set_calico_vpp_interface_name/templates/70-custom-calicovpp.link.j2 new file mode 100644 index 00000000..36f734a9 --- /dev/null +++ b/roles/bootstrap/set_calico_vpp_interface_name/templates/70-custom-calicovpp.link.j2 @@ -0,0 +1,4 @@ +[Match] +MACAddress={{ hostvars[inventory_hostname]['ansible_' + calico_vpp_interface_name].macaddress }} +[Link] +Name={{ calico_vpp.interface_name }} diff --git a/roles/bootstrap/set_intel_flexran_kernel_flags/tasks/main.yml b/roles/bootstrap/set_intel_flexran_kernel_flags/tasks/main.yml index c292c972..8bc9fd80 100644 --- a/roles/bootstrap/set_intel_flexran_kernel_flags/tasks/main.yml +++ b/roles/bootstrap/set_intel_flexran_kernel_flags/tasks/main.yml @@ -15,169 +15,74 @@ ## --- # probe CPU -- debug: msg="CPU={{ ansible_processor[2] }} cores={{ ansible_processor_cores }} count={{ ansible_processor_count }} nproc={{ ansible_processor_nproc }} tpc={{ ansible_processor_threads_per_core }} vcpus={{ ansible_processor_vcpus }}" # noqa yaml[line-length] +- ansible.builtin.debug: + msg: | + CPU: "{{ ansible_processor[2] }}" + cores: "{{ ansible_processor_cores }}" + count: "{{ ansible_processor_count }}" + nproc: "{{ ansible_processor_nproc }}" + tpc: "{{ ansible_processor_threads_per_core }}" + vcpus: "{{ ansible_processor_vcpus }}" - name: include Intel FlexRAN role vars include_vars: ../../intel_flexran/defaults/main.yml -- name: create Intel FlexRAN files directory on controller - file: - path: "{{ intel_flexran_files_dir }}" - state: directory - mode: '0755' +- name: get isolcpus + ansible.builtin.set_fact: + isol_vars: "{{ lookup('template', './isolcpus.j2') | from_yaml }}" -- name: transfer Intel FlexRAN kernel-cmdline generator to worker - copy: - src: '../../intel_flexran/files/kernel_cmdline_gen.sh' - dest: "{{ intel_flexran_files_dir }}" - mode: '0755' +- 
ansible.builtin.debug: + msg: | + isolcpus: "{{ isol_vars.isolcpus }}" + housekeeping: "{{ isol_vars.housekeeping }}" + pagesize: "{{ isol_vars.pagesize }}" -- name: generate Intel FlexRAN kernel-cmdline - shell: "./kernel_cmdline_gen.sh" # noqa command-instead-of-shell - args: - chdir: "{{ intel_flexran_files_dir }}" - register: generated_cmdline - changed_when: false - -- debug: msg="{{ generated_cmdline.stdout }}" - -- name: set Intel FlexRAN kernel flags for Host-16c-single - set_fact: - intel_flexran_cmdline: >- - GRUB_CMDLINE_LINUX="{{ generated_cmdline.stdout }}" {{ intel_flexran_marker }} - intel_flexran_isol_cores: "2-15,18-31" - intel_flexran_cpu_supported: true - when: - - ansible_processor_count == 1 - - ansible_processor_cores == 16 - - intel_flexran_type == "host" - -- name: set Intel FlexRAN kernel flags for Host-20c-single - set_fact: - intel_flexran_cmdline: >- - GRUB_CMDLINE_LINUX="{{ generated_cmdline.stdout }}" {{ intel_flexran_marker }} - intel_flexran_isol_cores: "2-19,22-39" - intel_flexran_cpu_supported: true - when: - - ansible_processor_count == 1 - - ansible_processor_cores == 20 - - intel_flexran_type == "host" - -# For Host-32c-single, the isol_cores is less than recommended(1-30,33-62), for the purpose of leaving more cpus for common tasks -- name: set Intel FlexRAN kernel flags for Host-32c-single - set_fact: +# Always create full cmdline, and override later if needed +- name: Set Intel FlexRAN kernel flags + ansible.builtin.set_fact: intel_flexran_cmdline: >- - GRUB_CMDLINE_LINUX="default_hugepagesz=1G hugepages=60 hugepagesz=1G nmi_watchdog=0 softlockup_panic=0 intel_iommu=on iommu=pt - vfio_pci.enable_sriov=1 vfio_pci.disable_idle_d3=1 rcu_nocbs=4-31,36-63 irqaffinity=0-3,32-35 isolcpus=managed_irq,domain,4-31,36-63 - kthread_cpus=0-3,32-35 nohz_full=4-31,36-63 crashkernel=auto enforcing=0 quiet rcu_nocb_poll rhgb selinux=0 mce=off audit=0 - pci=realloc pci=assign-busses rdt=l3cat skew_tick=1 nosoftlockup nohz=on" {{ intel_flexran_marker }} - intel_flexran_isol_cores: "4-31,36-63" - intel_flexran_cpu_supported: true - when: - - ansible_processor_count == 1 - - ansible_processor_cores == 32 - - intel_flexran_type == "host" - -# For Host-32c-dual, the isol_cores is less than recommended(1-62,65-126), for the purpose of leaving more cpus for common tasks -- name: set Intel FlexRAN kernel flags for Host-32c-dual - set_fact: - intel_flexran_cmdline: >- - GRUB_CMDLINE_LINUX="default_hugepagesz=1G hugepages=60 hugepagesz=1G nmi_watchdog=0 softlockup_panic=0 intel_iommu=on iommu=pt - vfio_pci.enable_sriov=1 vfio_pci.disable_idle_d3=1 rcu_nocbs=4-59,68-123 irqaffinity=0-3,60-63,64-67,124-127 - isolcpus=managed_irq,domain,4-59,68-123 kthread_cpus=0-3,60-63,64-67,124-127 nohz_full=4-59,68-123 crashkernel=auto enforcing=0 - quiet rcu_nocb_poll rhgb selinux=0 mce=off audit=0 pci=realloc pci=assign-busses rdt=l3cat skew_tick=1 nosoftlockup nohz=on" {{ intel_flexran_marker }} - intel_flexran_isol_cores: "4-59,68-123" - intel_flexran_cpu_supported: true - when: - - ansible_processor_count == 2 - - ansible_processor_cores == 32 - - intel_flexran_type == "host" - -# For Host-52c-dual, the isol_cores is less than recommended, for the purpose of leaving more cpus for common tasks -- name: set Intel FlexRAN kernel flags for Host-52c-dual - set_fact: - intel_flexran_cmdline: >- - GRUB_CMDLINE_LINUX="default_hugepagesz=1G hugepages=60 hugepagesz=1G nmi_watchdog=0 softlockup_panic=0 intel_iommu=on iommu=pt - vfio_pci.enable_sriov=1 vfio_pci.disable_idle_d3=1 
rcu_nocbs=4-99,108-203 irqaffinity=0-3,100-103,104-107,204-207 - isolcpus=managed_irq,domain,4-99,108-203 kthread_cpus=0-3,100-103,104-107,204-207 nohz_full=4-99,108-203 crashkernel=auto enforcing=0 - quiet rcu_nocb_poll rhgb selinux=0 mce=off audit=0 pci=realloc pci=assign-busses rdt=l3cat skew_tick=1 nosoftlockup nohz=on" {{ intel_flexran_marker }} - intel_flexran_isol_cores: "4-99,108-203" - intel_flexran_cpu_supported: true - when: - - ansible_processor_count == 2 - - ansible_processor_cores == 52 - - intel_flexran_type == "host" - -# For Host-56c-dual, the isol_cores is less than recommended, for the purpose of leaving more cpus for common tasks -- name: set Intel FlexRAN kernel flags for Host-56c-dual - set_fact: - intel_flexran_cmdline: >- - GRUB_CMDLINE_LINUX="default_hugepagesz=1G hugepages=60 hugepagesz=1G nmi_watchdog=0 softlockup_panic=0 intel_iommu=on iommu=pt - vfio_pci.enable_sriov=1 vfio_pci.disable_idle_d3=1 rcu_nocbs=4-107,116-219 irqaffinity=0-3,108-111,112-115,220-223 - isolcpus=managed_irq,domain,4-107,116-219 kthread_cpus=0-3,108-111,112-115,220-223 nohz_full=4-107,116-219 crashkernel=auto enforcing=0 - quiet rcu_nocb_poll rhgb selinux=0 mce=off audit=0 pci=realloc pci=assign-busses rdt=l3cat skew_tick=1 nosoftlockup nohz=on" {{ intel_flexran_marker }} - intel_flexran_isol_cores: "4-107,116-219" - intel_flexran_cpu_supported: true - when: - - ansible_processor_count == 2 - - ansible_processor_cores == 56 - - intel_flexran_type == "host" + GRUB_CMDLINE_LINUX="default_hugepagesz=1G hugepages={{ isol_vars.pagesize }} hugepagesz=1G + nmi_watchdog=0 softlockup_panic=0 intel_iommu=on iommu=pt vfio_pci.enable_sriov=1 + vfio_pci.disable_idle_d3=1 rcu_nocbs={{ isol_vars.isolcpus }} irqaffinity={{ isol_vars.housekeeping }} + isolcpus=managed_irq,domain,{{ isol_vars.isolcpus }} kthread_cpus={{ isol_vars.housekeeping }} + nohz_full={{ isol_vars.isolcpus }} crashkernel=auto enforcing=0 quiet rcu_nocb_poll rhgb selinux=0 mce=off + audit=0 pci=realloc pci=assign-busses rdt=l3cat skew_tick=1 nosoftlockup nohz=on" {{ intel_flexran_marker }} # for 5.15.0-1019RT and later, cgroup_disable=memory is no longer needed. # RKE2 cannot be installed with cgroup_disable=memory, so use 5.15.0-1019RT or later for FlexRAN deployment on RKE2. # The isol_cores differs from what is written in the wiki, as we think there is a mistake there. -- name: >- - set Intel FlexRAN kernel flags for Docker POD on Host-32c-single (6338N CPU) on ICX when kernel version is 5.15.0-1019RT and later. 
- See wiki: https://hub.docker.com/r/intel/flexran_l1_spree - set_fact: - intel_flexran_cmdline: >- - GRUB_CMDLINE_LINUX="intel_iommu=on iommu=pt vfio_pci.enable_sriov=1 vfio_pci.disable_idle_d3=1 usbcore.autosuspend=-1 selinux=0 - enforcing=0 nmi_watchdog=0 crashkernel=auto softlockup_panic=0 audit=0 cgroup_enable=memory mce=off hugepagesz=1G hugepages=60 - hugepagesz=2M hugepages=0 default_hugepagesz=1G kthread_cpus=0,31,32,63 irqaffinity=0,31,32,63" {{ intel_flexran_marker }} - intel_flexran_isol_cores: "1-30,33-62" - intel_flexran_cpu_supported: true +- name: Set Intel FlexRAN kernel flags and Real-Time profile for Docker Pod on SPR with kernel 5.15.0-1030RT and later + block: + - name: set Intel FlexRAN Real-Time profile + include_tasks: realtime_profile.yml + + - name: >- + set Intel FlexRAN kernel flags for Docker POD on SPR-EE MCC when kernel version is 5.15.0-1030RT and later. + See wiki: https://hub.docker.com/r/intel/flexran_l1_spree + ansible.builtin.set_fact: + intel_flexran_cmdline: >- + GRUB_CMDLINE_LINUX="intel_iommu=on iommu=pt vfio_pci.enable_sriov=1 vfio_pci.disable_idle_d3=1 + usbcore.autosuspend=-1 selinux=0 enforcing=0 nmi_watchdog=0 crashkernel=auto softlockup_panic=0 + audit=0 cgroup_enable=memory mce=off hugepagesz=1G hugepages={{ isol_vars.pagesize }} default_hugepagesz=1G + kthread_cpus={{ isol_vars.housekeeping }} irqaffinity={{ isol_vars.housekeeping }}" {{ intel_flexran_marker }} when: - - ansible_processor_count == 1 - - ansible_processor_cores == 32 - intel_flexran_type == "pod" - ansible_kernel >= "5.15.0-1030-realtime" - configured_arch == "spr" # This is for DSA when we enable it with FlexRAN at the same time - name: add sm_on in iommu to be compatible with DSA requirements - set_fact: + ansible.builtin.set_fact: intel_flexran_cmdline: "{{ intel_flexran_cmdline | replace('intel_iommu=on', 'intel_iommu=on,sm_on') }}" when: - configure_dsa_devices is defined and configure_dsa_devices -- debug: msg="final kernel cmdline is {{ intel_flexran_cmdline }}" +- ansible.builtin.debug: + msg: "final kernel cmdline is {{ intel_flexran_cmdline }}" - name: set Intel FlexRAN kernel flags in /etc/default/grub - lineinfile: + ansible.builtin.lineinfile: dest: /etc/default/grub regexp: '^GRUB_CMDLINE_LINUX="\${GRUB_CMDLINE_LINUX}(.*?)" {{ intel_flexran_marker }}$' line: '{{ intel_flexran_cmdline }}' diff --git a/roles/bootstrap/set_intel_flexran_kernel_flags/tasks/realtime_profile.yml b/roles/bootstrap/set_intel_flexran_kernel_flags/tasks/realtime_profile.yml index dd625e58..c2b4735e 100644 --- a/roles/bootstrap/set_intel_flexran_kernel_flags/tasks/realtime_profile.yml +++ b/roles/bootstrap/set_intel_flexran_kernel_flags/tasks/realtime_profile.yml @@ -31,7 +31,7 @@ - name: edit isolated_cores in /etc/tuned/realtime-variables.conf ansible.builtin.lineinfile: path: /etc/tuned/realtime-variables.conf - line: isolated_cores=1-30,33-62 + line: "isolated_cores={{ isol_vars.isolcpus }}" - name: uncomment isolate_managed_irq=Y in /etc/tuned/realtime-variables.conf ansible.builtin.lineinfile: path: /etc/tuned/realtime-variables.conf diff --git a/roles/bootstrap/set_intel_flexran_kernel_flags/templates/isolcpus.j2 b/roles/bootstrap/set_intel_flexran_kernel_flags/templates/isolcpus.j2 new file mode 100644 index 00000000..5fe009f7 --- /dev/null +++ b/roles/bootstrap/set_intel_flexran_kernel_flags/templates/isolcpus.j2 @@ -0,0 +1,30 @@ +{% set sockets = ansible_processor_count %} +{% set cores = ansible_processor_cores %} +{% set tpc = ansible_processor_threads_per_core %} + +{% if 
cores <= 32 and not ( cores == 32 and sockets == 2 ) %} +# Assign 2 cores to housekeeping + +{% if tpc == 1 %} +isolcpus: "1-{{ (cores * sockets) - 2 }}" +housekeeping: "0,{{ (cores * sockets) - 1 }}" + +{% else %} +isolcpus: "1-{{ (cores * sockets) - 2 }},{{ (cores * sockets) + 1 }}-{{ (cores * sockets * 2) - 2 }}" +housekeeping: "0,{{ (cores * sockets) - 1 }}-{{ cores * sockets }},{{ (cores * sockets * 2) - 1 }}" + +{% endif %} +{% else %} +# Assign 4 cores to housekeeping + +{% if tpc == 1 %} +isolcpus: "2-{{ (cores * sockets) - 3 }}" +housekeeping: "0-1,{{ (cores * sockets) - 2 }}-{{ (cores * sockets) - 1 }}" + +{% else %} +isolcpus: "2-{{ (cores * sockets) - 3 }},{{ (cores * sockets) + 2 }}-{{ (cores * sockets * 2) - 3 }}" +housekeeping: "0-1,{{ (cores * sockets) - 2 }}-{{ (cores * sockets) + 1 }},{{ (cores * sockets * 2) - 2 }}-{{ (cores * sockets * 2) - 1 }}" + +{% endif %} +{% endif %} +pagesize: {% if cores < 32 %}"40"{% else %}"60"{% endif %}
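To sanity-check the template's arithmetic, a worked example with illustrative numbers: a single-socket, 32-core CPU with two threads per core takes the two-core housekeeping branch, and cores * sockets = 32:

# isolcpus:     "1-30,33-62"   from 1-(32-2), (32+1)-(64-2)
# housekeeping: "0,31-32,63"   from 0, (32-1)-32, (64-1)
# pagesize:     "60"           since cores is not < 32
# These reproduce the isol cores "1-30,33-62" and kthread_cpus/irqaffinity
# "0,31,32,63" that were hardcoded in the removed SPR-EE pod task above.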
diff --git a/roles/bootstrap/set_sriov_kernel_flags/tasks/setup_sriov_kernel_flags.yml b/roles/bootstrap/set_sriov_kernel_flags/tasks/setup_sriov_kernel_flags.yml index 1c4b2e2b..f2658354 100644 --- a/roles/bootstrap/set_sriov_kernel_flags/tasks/setup_sriov_kernel_flags.yml +++ b/roles/bootstrap/set_sriov_kernel_flags/tasks/setup_sriov_kernel_flags.yml @@ -24,9 +24,9 @@ when: - qat_devices is defined and (qat_devices|length>0) - install_dpdk | default(false) - - (ansible_distribution == "Ubuntu" and ansible_distribution_version == "20.04" and update_kernel) or - (ansible_distribution == "Ubuntu" and ansible_distribution_version >= "21.04") or - (ansible_os_family == "RedHat" and ansible_distribution_version >= "8.4") + - (ansible_distribution == "Ubuntu" and ansible_distribution_version is version('20.04', '==') and update_kernel) or + (ansible_distribution == "Ubuntu" and ansible_distribution_version is version('21.04', '>=')) or + (ansible_os_family == "RedHat" and ansible_distribution_version is version('8.4', '>=')) - name: set iommu default kernel flags set_fact: diff --git a/roles/bootstrap/update_grub/tasks/main.yml b/roles/bootstrap/update_grub/tasks/main.yml index 4f9392ba..ad9d0fbe 100644 --- a/roles/bootstrap/update_grub/tasks/main.yml +++ b/roles/bootstrap/update_grub/tasks/main.yml @@ -16,6 +16,8 @@ --- - name: update grub.cfg command: update-grub + notify: + - reboot server when: ansible_os_family == "Debian" changed_when: true @@ -28,6 +30,7 @@ - name: update MBR grub2.cfg command: grub2-mkconfig -o /etc/grub2.cfg + changed_when: true when: grub_mbr_cfg.stat.exists - name: check if grub2-efi.cfg config file exists (EFI variant) @@ -37,6 +40,8 @@ - name: update MBR grub2-efi.cfg on Rocky / RHEL < 9.0 command: "grub2-mkconfig -o /etc/grub2-efi.cfg" + notify: + - reboot server when: grub_efi_cfg.stat.exists changed_when: true when: @@ -52,40 +57,10 @@ - name: update MBR grub.cfg on (Rocky / RHEL >= 9.0) command: "grub2-mkconfig -o /boot/efi/EFI/{{ ansible_distribution | lower }}/grub.cfg" + notify: + - reboot server when: grub_rhel_rocky_cfg.stat.exists changed_when: true when: - ansible_os_family == "RedHat" - - ansible_distribution_version >= "9.0" - -- name: create empty machine_id list from the worker nodes - set_fact: - machine_id_list: [] - -- name: collect machine_id list for the worker nodes - set_fact: - machine_id_list: "{{ machine_id_list | default([]) + [hostvars[item]['ansible_machine_id'] | default([])] }}" - with_items: - - "{{ groups['kube_node'] }}" - -- block: - - name: detect that machine-id duplicates over multiple nodes - debug: - msg: "Detected there are /etc/machine-id duplicates {{ machine_id_list }}, will generate a new machine-id for groups['kube_node'] nodes" - - - name: remove /etc/machine-id - file: - state: absent - path: /etc/machine-id - force: yes - - - name: create new /etc/machine-id (debian) - command: dbus-uuidgen --ensure=/etc/machine-id - changed_when: true - when: ansible_os_family == "Debian" - - - name: create new /etc/machine-id (redhat) - command: systemd-machine-id-setup - changed_when: true - when: ansible_os_family == "RedHat" - when: ( machine_id_list | unique | length < groups['kube_node'] | length ) diff --git a/roles/bootstrap/update_machine_id/tasks/main.yml b/roles/bootstrap/update_machine_id/tasks/main.yml new file mode 100644 index 00000000..427abf8d --- /dev/null +++ b/roles/bootstrap/update_machine_id/tasks/main.yml @@ -0,0 +1,51 @@ +## +## Copyright (c) 2020-2023 Intel Corporation. +## +## Licensed under the Apache License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. +## You may obtain a copy of the License at +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +## See the License for the specific language governing permissions and +## limitations under the License. +## +--- +- name: create empty machine_id list for the cluster nodes + set_fact: + machine_id_list: [] + +- name: collect machine_id list from the cluster nodes + set_fact: + machine_id_list: "{{ machine_id_list | default([]) + [hostvars[item]['ansible_machine_id'] | default([])] }}" + with_items: + - "{{ groups['k8s_cluster'] }}" + +- block: + - name: detect that machine-id duplicates over multiple nodes + debug: + msg: "Detected there are /etc/machine-id duplicates {{ machine_id_list }}, will generate a new machine-id for groups['k8s_cluster'] nodes" + + - name: remove /etc/machine-id + file: + state: absent + path: /etc/machine-id + force: yes + + - name: create new /etc/machine-id (debian) + command: dbus-uuidgen --ensure=/etc/machine-id + changed_when: true + notify: + - reboot server + when: ansible_os_family == "Debian" + + - name: create new /etc/machine-id (redhat) + command: systemd-machine-id-setup + changed_when: true + notify: + - reboot server + when: ansible_os_family == "RedHat" + when: ( machine_id_list | unique | length < groups['k8s_cluster'] | length ) diff --git a/roles/bootstrap/update_nic_drivers/tasks/i40e.yml b/roles/bootstrap/update_nic_drivers/tasks/i40e.yml index c7325332..e9009c65 100644 --- a/roles/bootstrap/update_nic_drivers/tasks/i40e.yml +++ b/roles/bootstrap/update_nic_drivers/tasks/i40e.yml @@ -86,8 +86,8 @@ when: - not i40e_driver_build_failed - (i40e_installed_version.stdout != i40e_driver_version and mgmt_interface_driver.stdout == i40e_driver_name) or - (i40e_installed_version.stdout != i40e_driver_version and ((ansible_os_family == "RedHat" and ansible_distribution_version >= "9.0") or - (ansible_distribution == "Ubuntu" and ansible_distribution_version >= "22.04") or update_kernel)) + (i40e_installed_version.stdout != i40e_driver_version and ((ansible_os_family == "RedHat" and ansible_distribution_version is version('9.0', '>=')) or + (ansible_distribution == "Ubuntu" and ansible_distribution_version is version('22.04', 
'>=')) or update_kernel)) - name: ensure that i40e module is loaded modprobe: diff --git a/roles/bootstrap/update_nic_drivers/tasks/ice.yml b/roles/bootstrap/update_nic_drivers/tasks/ice.yml index 92744580..bb77f494 100644 --- a/roles/bootstrap/update_nic_drivers/tasks/ice.yml +++ b/roles/bootstrap/update_nic_drivers/tasks/ice.yml @@ -111,8 +111,8 @@ when: - not ice_driver_build_failed - (ice_installed_version.stdout != ice_driver_version and mgmt_interface_driver.stdout == ice_driver_name) or - (ice_installed_version.stdout != ice_driver_version and ((ansible_os_family == "RedHat" and ansible_distribution_version >= "9.0") or - (ansible_distribution == "Ubuntu" and ansible_distribution_version >= "22.04") or update_kernel)) + (ice_installed_version.stdout != ice_driver_version and ((ansible_os_family == "RedHat" and ansible_distribution_version is version('9.0', '>=')) or + (ansible_distribution == "Ubuntu" and ansible_distribution_version is version('22.04', '>=')) or update_kernel)) - name: ensure that ice module is loaded modprobe: diff --git a/roles/bootstrap/update_nic_firmware/tasks/main.yml b/roles/bootstrap/update_nic_firmware/tasks/main.yml index 1551cd65..1eba43df 100644 --- a/roles/bootstrap/update_nic_firmware/tasks/main.yml +++ b/roles/bootstrap/update_nic_firmware/tasks/main.yml @@ -21,7 +21,7 @@ - name: validate host vars interface names with system interface names before FW update assert: - that: "item.name in check_nics_in_system.stdout" + that: item.name in check_nics_in_system.stdout fail_msg: - "Interface name (bus_id) defined in host vars (dataplane interfaces) does not match interface name (bus_id) found in system." - "Please select correct interface name (bus_id) in (dataplane interfaces)" diff --git a/roles/bootstrap/update_nic_firmware/tasks/update.yml b/roles/bootstrap/update_nic_firmware/tasks/update.yml index 933e8bf6..cbed3f9b 100644 --- a/roles/bootstrap/update_nic_firmware/tasks/update.yml +++ b/roles/bootstrap/update_nic_firmware/tasks/update.yml @@ -203,6 +203,7 @@ executable: /bin/bash register: nvmupdate_result when: continue_nvmupdate | bool + changed_when: true failed_when: false - name: show additional message on unsupported platforms diff --git a/roles/cadvisor_install/tasks/cleanup.yml b/roles/cadvisor_install/tasks/cleanup.yml index ab2b2eae..64121977 100644 --- a/roles/cadvisor_install/tasks/cleanup.yml +++ b/roles/cadvisor_install/tasks/cleanup.yml @@ -24,11 +24,13 @@ chdir: "{{ (project_root_dir, 'cadvisor', 'deploy', 'kubernetes', 'overlays', 'cek') | path_join }}" register: kustomize changed_when: true + failed_when: false - name: delete k8s resources kubernetes.core.k8s: definition: "{{ kustomize.stdout }}" state: absent + failed_when: false tags: - cadvisor @@ -36,3 +38,4 @@ ansible.builtin.file: path: "{{ (project_root_dir, 'cadvisor') | path_join }}" state: absent + failed_when: false diff --git a/roles/calico_vpp_install/defaults/main.yml b/roles/calico_vpp_install/defaults/main.yml index 14660420..7aff98de 100644 --- a/roles/calico_vpp_install/defaults/main.yml +++ b/roles/calico_vpp_install/defaults/main.yml @@ -16,10 +16,10 @@ --- calico_vpp_files_dir: "{{ (project_root_dir, 'calico-vpp-files') | path_join }}" -k8s_calico_vpp_version: "v3.25.1" +k8s_calico_vpp_version: "v3.26.0" # install operator to manage the installation, upgrade, and general lifecycle of a Calico cluster k8s_calico_tigera_operator: "https://raw.githubusercontent.com/projectcalico/calico/{{ k8s_calico_vpp_version }}/manifests/tigera-operator.yaml" 
-k8s_calico_tigera_operator_checksum: "sha256:606a55409e98ede9aa13864348ab90093539eea61dafca9c92f040980ba07b58" +k8s_calico_tigera_operator_checksum: "sha256:9d7513a65ebc5ba9b6bdf8f43060ccd23eb6751eb843ac158cff935181837cc4" # custom install resources to configure Calico k8s_calico_custom_resources: "https://raw.githubusercontent.com/projectcalico/vpp-dataplane/{{ k8s_calico_vpp_version }}/yaml/calico/installation-default.yaml" diff --git a/roles/calico_vpp_install/tasks/calico_vpp_preflight.yml b/roles/calico_vpp_install/tasks/calico_vpp_preflight.yml index aaa605e1..42e99cb4 100644 --- a/roles/calico_vpp_install/tasks/calico_vpp_preflight.yml +++ b/roles/calico_vpp_install/tasks/calico_vpp_preflight.yml @@ -22,6 +22,7 @@ - not kube_network_plugin_multus - hugepages_enabled | default(false) - number_of_hugepages_1G >= 16 + - install_dpdk | default(false) fail_msg: | Make sure that following variables are set correctly: - kube_network_plugin: cni @@ -29,10 +30,14 @@ - kube_network_plugin_multus: false - hugepages_enabled: true - number_of_hugepages_1G: 16 (at least 16) + - install_dpdk: true success_msg: "Required Calico VPP Dataplane variables are set correctly" - ansible.builtin.debug: msg="the given IP in inventory.ini is {{ ip }}" +- name: re-gather facts to get the latest information + ansible.builtin.setup: + - name: parse interfaces from given IP set_fact: calico_vpp_interface_name: "{{ item }}" diff --git a/roles/calico_vpp_install/tasks/main.yml b/roles/calico_vpp_install/tasks/main.yml index 2681bc1b..d3077c92 100644 --- a/roles/calico_vpp_install/tasks/main.yml +++ b/roles/calico_vpp_install/tasks/main.yml @@ -22,6 +22,7 @@ path: "{{ calico_vpp_files_dir }}" state: directory mode: '0755' + when: inventory_hostname == groups['kube_control_plane'][0] - name: download tigera-operator deployment file ansible.builtin.get_url: @@ -32,11 +33,20 @@ register: tigera_operator_download until: tigera_operator_download is not failed retries: 5 + when: inventory_hostname == groups['kube_control_plane'][0] + +- name: Update tigera-operator deployment file + ansible.builtin.lineinfile: + path: "{{ (calico_vpp_files_dir, 'tigera-operator.yaml') | path_join }}" + insertafter: 'kubernetes.io/os: linux' + line: ' node-role.kubernetes.io/control-plane: ""' + when: inventory_hostname == groups['kube_control_plane'][0] - name: install tigera-operator kubernetes.core.k8s: state: present src: "{{ (calico_vpp_files_dir, 'tigera-operator.yaml') | path_join }}" + when: inventory_hostname == groups['kube_control_plane'][0] - name: wait for tigera-operator ready kubernetes.core.k8s_info: @@ -48,6 +58,7 @@ reason: NewReplicaSetAvailable type: Progressing wait_timeout: 240 + when: inventory_hostname == groups['kube_control_plane'][0] - name: generate calico and calico-vpp deployment file ansible.builtin.template: @@ -58,6 +69,7 @@ loop: - {src: 'calico.yaml.j2', dst: 'calico.yaml'} - {src: 'calico-vpp.yaml.j2', dst: 'calico-vpp.yaml'} + when: inventory_hostname == groups['kube_control_plane'][0] # Pause 15 secs to let the calico basic part start first, and then calico vpp - name: deploy calico @@ -69,11 +81,13 @@ - 'calico-vpp.yaml' loop_control: pause: 15 + when: inventory_hostname == groups['kube_control_plane'][0] # API server will be briefly in disconnected status during calico cni initialization, so wait for a while - name: wait for calico cni to be fully initialized pause: - seconds: 180 + seconds: 300 + when: inventory_hostname == groups['kube_control_plane'][0] - name: wait for calico-vpp to be ready 
kubernetes.core.k8s_info: @@ -89,6 +103,7 @@ ds_status.resources | length > 0 and ds_status.resources[0].status.numberReady > 0 ) + when: inventory_hostname == groups['kube_control_plane'][0] - name: wait for calico to be ready block: @@ -123,6 +138,7 @@ ds_status.resources | length > 0 and ds_status.resources[0].status.numberReady > 0 ) + when: inventory_hostname == groups['kube_control_plane'][0] - name: restart kubelet service ansible.builtin.systemd: @@ -133,7 +149,9 @@ - name: install calicoctl ansible.builtin.include_tasks: file: calicoctl.yml + when: inventory_hostname == groups['kube_control_plane'][0] - name: install calivppctl ansible.builtin.include_tasks: file: calivppctl.yml + when: inventory_hostname == groups['kube_control_plane'][0] diff --git a/roles/check_machine_type/tasks/check_machine_type.yml b/roles/check_machine_type/tasks/check_machine_type.yml index 57b56ccf..2b1ef779 100644 --- a/roles/check_machine_type/tasks/check_machine_type.yml +++ b/roles/check_machine_type/tasks/check_machine_type.yml @@ -18,15 +18,20 @@ ansible.builtin.set_fact: cpu_id: "{{ ansible_processor[2] | regex_search('\\$?\\d\\d\\d\\d\\%?\\@?\\w?|\\d\\d/\\d\\w') }}" # noqa jinja[spacing] +- name: add dummy CPU ID for GNR in preflight + ansible.builtin.set_fact: + cpu_id: "9999" + when: cpu_id | length == 0 + - name: print CPU ID ansible.builtin.debug: msg: "CPU ID: {{ cpu_id }}" - name: check if CPU has confirmed support ansible.builtin.assert: - that: "cpu_id in {{ lookup('ansible.builtin.vars', 'confirmed_' + configured_arch + '_cpus') }} \ - {% if configured_arch == 'clx' %} or cpu_id in {{ confirmed_clx_ncpus }} {% endif %} \ - or cpu_id in {{ unconfirmed_cpu_models }}" + that: cpu_id in lookup('ansible.builtin.vars', 'confirmed_' + configured_arch + '_cpus') + {% if configured_arch == 'clx' %} or cpu_id in confirmed_clx_ncpus {% endif %} + or cpu_id in unconfirmed_cpu_models fail_msg: "CPU model '{{ cpu_id }}' present on target is not in the confirmed CPUs list.\n To proceed, please add '{{ cpu_id }}' to the list of unconfirmed CPUs in variable 'unconfirmed_cpu_models' in group_vars.\n @@ -34,13 +39,15 @@ when: - configured_arch not in ['atom', 'core', 'ultra'] -- name: set skl, icx, clx, spr to false +- name: set skl, icx, clx, spr, emr, gnr to false ansible.builtin.set_fact: is_skl: false is_clx: false is_clx_ncpu: false is_icx: false is_spr: false + is_emr: false + is_gnr: false - name: set is_skl architecture variable ansible.builtin.set_fact: @@ -62,6 +69,16 @@ is_spr: true when: cpu_id in confirmed_spr_cpus +- name: set is_emr architecture variable + ansible.builtin.set_fact: + is_emr: true + when: cpu_id in confirmed_emr_cpus + +- name: set is_gnr architecture variable + ansible.builtin.set_fact: + is_gnr: true + when: cpu_id in confirmed_gnr_cpus + - name: check if clx_ncpu mode ansible.builtin.set_fact: is_clx_ncpu: true diff --git a/roles/check_machine_type/vars/main.yml b/roles/check_machine_type/vars/main.yml index 5bb5013e..a97ee834 100644 --- a/roles/check_machine_type/vars/main.yml +++ b/roles/check_machine_type/vars/main.yml @@ -32,7 +32,6 @@ confirmed_core_cpus: - "12900" - "1360P" - confirmed_skl_cpus: # Sky Lake Xeon Gold (quad) - "6152" @@ -57,11 +56,14 @@ confirmed_icx_cpus: - "8358" - "8360Y" - "8380" + - "1747N" - "$0000%@" confirmed_spr_cpus: + - "5418N" - "6421N" - "6438N" + - "8468H" - "8470N" - "8471N" - "8478C" @@ -76,3 +78,6 @@ confirmed_emr_cpus: - "0000" - "8592" - "6548N" + +confirmed_gnr_cpus: + - "9999" diff --git a/roles/cluster_defaults/defaults/main.yml 
b/roles/cluster_defaults/defaults/main.yml index f96833f1..84b6d150 100644 --- a/roles/cluster_defaults/defaults/main.yml +++ b/roles/cluster_defaults/defaults/main.yml @@ -36,5 +36,5 @@ proxy_env: {} registry_containerd: "/var/lib/kubelet/config.json" kube_rbac_proxy_image_repo: "quay.io/brancz/kube-rbac-proxy" -kube_rbac_proxy_image_tag: "v0.14.3" +kube_rbac_proxy_image_tag: "v0.15.0" kube_rbac_proxy_tls_ciphers: "TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305" # noqa yaml[line-length] diff --git a/roles/collectd_install/defaults/main.yml b/roles/collectd_install/defaults/main.yml index f58a6eaf..46492b5a 100644 --- a/roles/collectd_install/defaults/main.yml +++ b/roles/collectd_install/defaults/main.yml @@ -18,7 +18,7 @@ collectd_configuration_files_dir: "{{ (host_collectd_folder, 'collectd.conf.d') barometer_collectd_dir: "{{ (project_root_dir, 'barometer') | path_join }}" collectd_deployment_dir: "{{ (project_root_dir, 'k8s', 'collectd') | path_join }}" -collectd_scrap_interval: 30 +collectd_scrape_interval: 30 collectd_write_threads: 25 collectd_read_threads: 25 collectd_timeout: 2 @@ -97,6 +97,8 @@ collectd_plugins: - ovs_events - ovs_pmd_stats - ovs_stats + base_video_analytics: + - *full_nfv build_your_own: - *full_nfv on_prem_vss: diff --git a/roles/collectd_install/files/pkgpower.patch b/roles/collectd_install/files/pkgpower.patch new file mode 100644 index 00000000..8b2febc3 --- /dev/null +++ b/roles/collectd_install/files/pkgpower.patch @@ -0,0 +1,45 @@ +diff --git a/telemetry/pkgpower.py b/telemetry/pkgpower.py +index 295791a..61e7cf1 100644 +--- a/telemetry/pkgpower.py ++++ b/telemetry/pkgpower.py +@@ -106,6 +106,22 @@ def _get_node_id(nodepath): + match_nodepath = reg_ex.match(nodepath) + return int(match_nodepath.group("node")) + ++def _is_pkg_exists(cpu): ++ power_path = os.path.join( ++ BASE_POWERCAP_PATH, "intel-rapl:{}".format(cpu.node_id)) ++ max_power_file_path = os.path.join(power_path, "constraint_0_max_power_uw") ++ max_energy_file = os.path.join(power_path, "max_energy_range_uj") ++ energy_file = os.path.join(power_path, "energy_file") ++ if os.path.exists(power_path): ++ if (os.access(max_power_file_path, os.R_OK) and ++ os.access(max_energy_file, os.R_OK) and ++ os.access(energy_file, os.R_OK)): ++ return True ++ else: ++ return False ++ else: ++ return False ++ + def config_func(_unused_config): + ''' + call back function called by collectd, here +@@ -118,10 +134,13 @@ def config_func(_unused_config): + node_ids = [_get_node_id(np) for np in nodes] + for node in node_ids: + cpu = _CpuPowerStatus(node) +- _get_max_power_consumption(cpu) +- _get_tdp_power(cpu) +- _get_pkg_name(cpu) +- __CPUS += [cpu] ++ if _is_pkg_exists(cpu): ++ _get_max_power_consumption(cpu) ++ _get_tdp_power(cpu) ++ _get_pkg_name(cpu) ++ __CPUS += [cpu] ++ else: ++ continue + + def _read_pkg_power(cpu): + # first, read current power consumption value and timestamp diff --git a/roles/collectd_install/tasks/collectd.yml b/roles/collectd_install/tasks/collectd.yml index e7d4a0e6..90e6f4f9 100644 --- a/roles/collectd_install/tasks/collectd.yml +++ b/roles/collectd_install/tasks/collectd.yml @@ -32,15 +32,11 @@ group: root mode: 0755 -- name: check if namespace exists - command: kubectl get namespace {{ collectd_namespace }} - register: ns_exists - failed_when: no - changed_when: false - -- name: create a namespace 
- command: kubectl create namespace {{ collectd_namespace }} - when: '"NotFound" in ns_exists.stderr' +- name: Create collectd namespace + kubernetes.core.k8s: + kind: Namespace + name: "{{ collectd_namespace }}" + state: present - name: template collectd-serviceMonitor template: diff --git a/roles/collectd_install/tasks/copy-configs.yml b/roles/collectd_install/tasks/copy-configs.yml index 9f8b759e..8e14c981 100644 --- a/roles/collectd_install/tasks/copy-configs.yml +++ b/roles/collectd_install/tasks/copy-configs.yml @@ -80,17 +80,24 @@ mode: 0755 - name: get PowerPkg file - get_url: - url: https://raw.githubusercontent.com/intel/CommsPowerManagement/73bdab6f6e3861f3d86e15e36c363f343a142509/telemetry/pkgpower.py - dest: "{{ collectd_configuration_files_dir }}/pkgpower.py" - owner: root - group: root - mode: 0755 - checksum: "sha1:4aa175aae132ca0d251d839462d23d971b039713" + block: + - name: download pkgpower file + ansible.builtin.get_url: + url: https://raw.githubusercontent.com/intel/CommsPowerManagement/73bdab6f6e3861f3d86e15e36c363f343a142509/telemetry/pkgpower.py + dest: "{{ collectd_configuration_files_dir }}/pkgpower.py" + owner: root + group: root + mode: 0755 + checksum: "sha1:4aa175aae132ca0d251d839462d23d971b039713" + register: pkgpower_download + until: pkgpower_download is not failed + retries: 5 + + - name: patch pkgpower file + ansible.posix.patch: + src: "pkgpower.patch" + dest: "{{ collectd_configuration_files_dir }}/pkgpower.py" when: "'pkgpower' in plugins" - register: pkgpower_download - until: pkgpower_download is not failed - retries: 5 # Segmentation fault (core dump) issues on Ubuntu were previously caused by lshw; the recommended tools are # hwinfo for RHEL / Rocky & lshw for Ubuntu. diff --git a/roles/collectd_install/tasks/preflight.yml b/roles/collectd_install/tasks/preflight.yml index e0400946..418ba95d 100644 --- a/roles/collectd_install/tasks/preflight.yml +++ b/roles/collectd_install/tasks/preflight.yml @@ -20,3 +20,10 @@ msg: - Deployment profile '{{ profile_name }}' has no collectd plugins selection defined. - Please define collectd plugins selection for the current profile in {{ role_name }} role defaults. + +- name: Check that telegraf is disabled + ansible.builtin.assert: + that: + - not telegraf_enabled | default(false) + fail_msg: | + When collectd is enabled, Telegraf must be disabled. 
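The scrape-interval variable rename is user-visible: overrides must now use collectd_scrape_interval, which feeds both collectd's Interval setting and the ServiceMonitor scrape interval in the templates below. A hypothetical group_vars override (the value is an example):

collectd_scrape_interval: 60   # seconds between collectd reads / Prometheus scrapes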
diff --git a/roles/collectd_install/templates/collectd-serviceMonitor.yml.j2 b/roles/collectd_install/templates/collectd-serviceMonitor.yml.j2 index a522465b..4d0371e3 100644 --- a/roles/collectd_install/templates/collectd-serviceMonitor.yml.j2 +++ b/roles/collectd_install/templates/collectd-serviceMonitor.yml.j2 @@ -9,7 +9,7 @@ metadata: spec: endpoints: - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - interval: {{ collectd_scrap_interval }}s + interval: {{ collectd_scrape_interval }}s port: https scheme: https tlsConfig: @@ -18,4 +18,4 @@ spec: caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt selector: matchLabels: - app: collectd-service \ No newline at end of file + app: collectd-service diff --git a/roles/collectd_install/templates/collectd.conf b/roles/collectd_install/templates/collectd.conf index 481119eb..f8adc431 100644 --- a/roles/collectd_install/templates/collectd.conf +++ b/roles/collectd_install/templates/collectd.conf @@ -1,4 +1,4 @@ -Interval {{ collectd_scrap_interval }} +Interval {{ collectd_scrape_interval }} Timeout {{ collectd_timeout }} ReadThreads {{ collectd_read_threads }} WriteThreads {{ collectd_write_threads }} @@ -8,4 +8,3 @@ Hostname "{{ hostvars[inventory_hostname]['ansible_hostname'] }}" Filter "*.conf" - diff --git a/roles/container_engine/containerd/defaults/main.yml b/roles/container_engine/containerd/defaults/main.yml index c706b14a..c625b759 100644 --- a/roles/container_engine/containerd/defaults/main.yml +++ b/roles/container_engine/containerd/defaults/main.yml @@ -14,8 +14,8 @@ ## limitations under the License. ## --- -containerd_version: 1.7.3 -containerd_archive_checksum: "de7f61aacba88ee647a7dcde1ca77672ec44ab9fb3e58ae90c0efc9b2d8f3068" +containerd_version: 1.7.8 +containerd_archive_checksum: "5f1d017a5a7359514d6187d6656e88fb2a592d107e6298db7963dbddb9a111d9" containerd_download_url: "https://github.com/containerd/containerd/releases/download/v{{ containerd_version }}/containerd-{{ containerd_version }}-linux-amd64.tar.gz" # noqa yaml[line-length] containerd_bin_dir: "/usr/local/bin" @@ -71,8 +71,12 @@ containerd_metrics_address: "" containerd_metrics_grpc_histogram: false -containerd_registries: - "docker.io": "https://registry-1.docker.io" +containerd_registries_mirrors: + - prefix: docker.io + mirrors: + - host: https://registry-1.docker.io + capabilities: ["pull", "resolve"] + skip_verify: false containerd_max_container_log_line_size: -1 diff --git a/roles/container_engine/containerd/handlers/main.yml b/roles/container_engine/containerd/handlers/main.yml index fdda1525..053bdce7 100644 --- a/roles/container_engine/containerd/handlers/main.yml +++ b/roles/container_engine/containerd/handlers/main.yml @@ -16,6 +16,7 @@ --- - name: restart containerd command: /bin/true + changed_when: false notify: - containerd | restart containerd - containerd | wait for containerd @@ -32,4 +33,5 @@ register: containerd_ready retries: 8 delay: 4 + changed_when: false until: containerd_ready.rc == 0 diff --git a/roles/container_engine/containerd/templates/config.toml.j2 b/roles/container_engine/containerd/templates/config.toml.j2 index 8d005739..b60e3c00 100644 --- a/roles/container_engine/containerd/templates/config.toml.j2 +++ b/roles/container_engine/containerd/templates/config.toml.j2 @@ -38,20 +38,20 @@ oom_score = {{ containerd_oom_score }} {% endfor %} [plugins."io.containerd.grpc.v1.cri".registry] [plugins."io.containerd.grpc.v1.cri".registry.mirrors] -{% for registry, addr in containerd_registries.items() %} - 
[plugins."io.containerd.grpc.v1.cri".registry.mirrors."{{ registry }}"] - endpoint = ["{{ ([ addr ] | flatten ) | join('","') }}"] +{% set insecure_registries_addr = [] %} +{% for registry in containerd_registries_mirrors %} + [plugins."io.containerd.grpc.v1.cri".registry.mirrors."{{ registry.prefix }}"] +{% set endpoint = [] %} +{% for mirror in registry.mirrors %} +{% if endpoint.append(mirror.host) %}{% endif %} +{% if mirror.skip_verify is defined and mirror.skip_verify|bool %}{% if insecure_registries_addr.append(mirror.host | urlsplit('netloc')) %}{% endif %}{% endif %} {% endfor %} -{% if containerd_insecure_registries is defined and containerd_insecure_registries|length>0 %} -{% for registry, addr in containerd_insecure_registries.items() %} - [plugins."io.containerd.grpc.v1.cri".registry.mirrors."{{ registry }}"] - endpoint = ["{{ ([ addr ] | flatten ) | join('","') }}"] + endpoint = ["{{ ( endpoint | unique ) | join('","') }}"] {% endfor %} -{% for addr in containerd_insecure_registries.values() | flatten | unique %} +{% for addr in insecure_registries_addr | unique %} [plugins."io.containerd.grpc.v1.cri".registry.configs."{{ addr }}".tls] insecure_skip_verify = true {% endfor %} -{% endif %} {% for registry in containerd_registry_auth if registry['registry'] is defined %} {% if (registry['username'] is defined and registry['password'] is defined) or registry['auth'] is defined %} [plugins."io.containerd.grpc.v1.cri".registry.configs."{{ registry['registry'] }}".auth] diff --git a/roles/container_engine/crictl/defaults/main.yml b/roles/container_engine/crictl/defaults/main.yml index 53f16a25..b5677f72 100644 --- a/roles/container_engine/crictl/defaults/main.yml +++ b/roles/container_engine/crictl/defaults/main.yml @@ -13,8 +13,8 @@ ## See the License for the specific language governing permissions and ## limitations under the License. 
## -crictl_version: "v1.27.0" -crictl_binary_checksum: "d335d6e16c309fbc3ff1a29a7e49bb253b5c9b4b030990bf7c6b48687f985cee" +crictl_version: "v1.28.0" +crictl_binary_checksum: "8dc78774f7cbeaf787994d386eec663f0a3cf24de1ea4893598096cb39ef2508" crictl_repo_url: "https://github.com/kubernetes-sigs/cri-tools/releases/download/" crictl_download_url: "{{ crictl_repo_url }}{{ crictl_version }}/crictl-{{ crictl_version }}-{{ ansible_system | lower }}-amd64.tar.gz" diff --git a/roles/container_engine/crio/defaults/main.yml b/roles/container_engine/crio/defaults/main.yml index 7361d665..4df3eb07 100644 --- a/roles/container_engine/crio/defaults/main.yml +++ b/roles/container_engine/crio/defaults/main.yml @@ -24,9 +24,9 @@ crio_log_level: "info" crio_metrics_port: "9090" crio_pause_image: "k8s.gcr.io/pause:3.9" -crio_version: "v1.27.0" +crio_version: "v1.28.1" crio_download_url: "https://storage.googleapis.com/cri-o/artifacts/cri-o.amd64.{{ crio_version }}.tar.gz" -crio_archive_checksums: "8f99db9aeea00299cb3f28ee61646472014cac91930e4c7551c9153f8f720093" +crio_archive_checksums: "63cee2e67e283e29d790caa52531bcca7bc59473fb73bde75f4fd8daa169d4bf" crio: version: "{{ crio_version }}" diff --git a/roles/container_engine/crio/handlers/main.yml b/roles/container_engine/crio/handlers/main.yml index edc114e5..938f9727 100644 --- a/roles/container_engine/crio/handlers/main.yml +++ b/roles/container_engine/crio/handlers/main.yml @@ -16,6 +16,7 @@ --- - name: restart crio command: /bin/true + changed_when: false notify: - reload systemd - reload crio diff --git a/roles/container_engine/docker/defaults/main.yml b/roles/container_engine/docker/defaults/main.yml index b14f7efd..7271f4e9 100644 --- a/roles/container_engine/docker/defaults/main.yml +++ b/roles/container_engine/docker/defaults/main.yml @@ -16,7 +16,7 @@ --- docker_version: "20.10.20" docker_cli_version: "{{ docker_version }}" -containerd_version: "1.6.16" # Containerd version installed when docker runtime is used +containerd_version: "1.6.18" # Containerd version installed when docker runtime is used containerd_package: 'containerd.io' diff --git a/roles/container_engine/docker/handlers/main.yml b/roles/container_engine/docker/handlers/main.yml index f35d5cd6..b4182f53 100644 --- a/roles/container_engine/docker/handlers/main.yml +++ b/roles/container_engine/docker/handlers/main.yml @@ -16,6 +16,7 @@ --- - name: restart docker command: /bin/true + changed_when: false notify: - reload systemd - reload docker @@ -32,6 +33,7 @@ - name: wait for docker command: "{{ docker_bin_dir }}/docker images" + changed_when: false register: docker_ready retries: 20 delay: 1 diff --git a/roles/container_engine/podman/tasks/main.yml b/roles/container_engine/podman/tasks/main.yml index c5a4bf0e..6268fac7 100644 --- a/roles/container_engine/podman/tasks/main.yml +++ b/roles/container_engine/podman/tasks/main.yml @@ -22,7 +22,7 @@ state: latest when: - '"crio" in container_runtime' - - ansible_distribution == "Ubuntu" and ansible_distribution_version == "22.04" + - ansible_distribution == "Ubuntu" and ansible_distribution_version is version('22.04', '==') - name: install podman package package: diff --git a/roles/container_engine/runc/defaults/main.yml b/roles/container_engine/runc/defaults/main.yml index c6ef9beb..45925810 100644 --- a/roles/container_engine/runc/defaults/main.yml +++ b/roles/container_engine/runc/defaults/main.yml @@ -16,10 +16,10 @@ --- runc_bin_dir: "/usr/local/bin" -runc_version: v1.1.8 +runc_version: v1.1.9 runc_download_url: 
"https://github.com/opencontainers/runc/releases/download/{{ runc_version }}/runc.amd64" -runc_binary_checksum: "1d05ed79854efc707841dfc7afbf3b86546fc1d0b3a204435ca921c14af8385b" +runc_binary_checksum: "b9bfdd4cb27cddbb6172a442df165a80bfc0538a676fbca1a6a6c8f4c6933b43" runc_binary: dest: "{{ (runc_bin_dir, 'runc') | path_join }}" diff --git a/roles/container_registry/charts/container-registry/templates/deployment.yaml b/roles/container_registry/charts/container-registry/templates/deployment.yaml index df73e8d7..d9b46f00 100644 --- a/roles/container_registry/charts/container-registry/templates/deployment.yaml +++ b/roles/container_registry/charts/container-registry/templates/deployment.yaml @@ -70,9 +70,6 @@ spec: nodeSelector: kubernetes.io/hostname: {{ .Values.node_name }} tolerations: - - effect: NoSchedule - key: node-role.kubernetes.io/master - operator: Exists - effect: NoSchedule key: node-role.kubernetes.io/control-plane operator: Exists diff --git a/roles/container_registry/defaults/main.yml b/roles/container_registry/defaults/main.yml index e848f8c2..0c780529 100644 --- a/roles/container_registry/defaults/main.yml +++ b/roles/container_registry/defaults/main.yml @@ -23,9 +23,9 @@ registry_password: registry_size: 10Gi registry_image: "docker.io/library/registry" -registry_version: 2.8.2 +registry_version: 2.8.3 registry_nginx_image: "docker.io/library/nginx" -registry_nginx_version: 1.25.2 +registry_nginx_version: 1.25.3 docker_pip_pkg_version: 6.0.0 registry_tls_secret_name: container-registry-tls diff --git a/roles/container_registry/tasks/tls.yml b/roles/container_registry/tasks/tls.yml index 11339b01..c7cf2a5b 100644 --- a/roles/container_registry/tasks/tls.yml +++ b/roles/container_registry/tasks/tls.yml @@ -80,7 +80,7 @@ state: present register: csr_signing - - name: Approve CSR to sign certificate + - name: Approve CSR to sign certificate # noqa no-handler ansible.builtin.command: >- kubectl -n {{ registry_namespace }} certificate approve {{ registry_csr_name }} changed_when: true diff --git a/roles/create_signed_k8s_certs/tasks/create_signed_k8s_certs.yml b/roles/create_signed_k8s_certs/tasks/create_signed_k8s_certs.yml index 587dee2e..a340b457 100644 --- a/roles/create_signed_k8s_certs/tasks/create_signed_k8s_certs.yml +++ b/roles/create_signed_k8s_certs/tasks/create_signed_k8s_certs.yml @@ -33,27 +33,17 @@ group: root become: yes -- name: check if namespace exists - command: kubectl get namespace {{ k8s_namespace }} - register: ns_exists - failed_when: no - changed_when: false - -- name: create a namespace - command: kubectl create namespace {{ k8s_namespace }} - when: - - '"NotFound" in ns_exists.stderr' - -- name: check if any pre-existing csr secrets exists in Kubernetes - command: kubectl get csr {{ secret_name }}.{{ k8s_namespace }} - register: csr_exists - failed_when: no - changed_when: false - -- name: delete any pre-existing certs/key/CSR from Kubernetes - command: kubectl delete csr {{ secret_name }}.{{ k8s_namespace }} - when: - - '"NotFound" not in csr_exists.stderr' +- name: Create namespace + kubernetes.core.k8s: + kind: Namespace + name: "{{ k8s_namespace }}" + state: present + +- name: Remove any pre-existing csr secrets + kubernetes.core.k8s: + kind: CertificateSigningRequest + name: "{{ secret_name }}.{{ k8s_namespace }}" + state: absent - name: populate CSR template template: diff --git a/roles/eck_install/defaults/main.yml b/roles/eck_install/defaults/main.yml new file mode 100644 index 00000000..fc084491 --- /dev/null +++ 
b/roles/eck_install/defaults/main.yml @@ -0,0 +1,20 @@ +## +## Copyright (c) 2020-2023 Intel Corporation. +## +## Licensed under the Apache License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. +## You may obtain a copy of the License at +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +## See the License for the specific language governing permissions and +## limitations under the License. +## +eck_namespace: "monitoring" +eck_version: "2.10.0" +eck_elasticsearch_version: "8.11.3" +eck_crds_url: "https://download.elastic.co/downloads/eck/{{ eck_version }}/crds.yaml" +eck_operator_url: "https://download.elastic.co/downloads/eck/{{ eck_version }}/operator.yaml" diff --git a/roles/elasticsearch_install/tasks/cleanup.yml b/roles/eck_install/tasks/cleanup.yml similarity index 65% rename from roles/elasticsearch_install/tasks/cleanup.yml rename to roles/eck_install/tasks/cleanup.yml index ee9b4f90..eaae9463 100644 --- a/roles/elasticsearch_install/tasks/cleanup.yml +++ b/roles/eck_install/tasks/cleanup.yml @@ -15,28 +15,18 @@ ## --- - block: - - name: remove elasticsearch chart - kubernetes.core.helm: - release_name: "{{ elasticsearch_release_name }}" - release_namespace: "{{ elasticsearch_namespace }}" + - name: remove elasticsearch resources + kubernetes.core.k8s: state: absent - failed_when: false - - - name: remove elasticsearch PVC - command: kubectl delete pvc elasticsearch-master-elasticsearch-master-0 -n monitoring - changed_when: false - failed_when: false - - - name: remove elasticsearch resources - command: kubectl delete -f ./ - args: - chdir: "{{ (project_root_dir, 'elasticsearch') | path_join }}" - failed_when: false - changed_when: false + src: "{{ (project_root_dir, 'elasticsearch', item) | path_join }}" + loop: + - elasticsearch_certs.yml + - elasticsearch_storageclass.yml + - elasticsearch_data.yml - name: get elasticsearch secrets ansible.builtin.shell: - cmd: kubectl get secrets -n monitoring | grep 'elasticsearch' | awk '{ print $1 }' + cmd: set -o pipefail && kubectl get secrets -n "{{ eck_namespace }}" | grep 'elasticsearch' | awk '{ print $1 }' args: executable: /bin/bash register: elasticsearch_secrets @@ -47,7 +37,7 @@ kubernetes.core.k8s: kind: secret name: "{{ item }}" - namespace: monitoring + namespace: "{{ eck_namespace }}" state: absent loop: "{{ elasticsearch_secrets.stdout_lines }}" failed_when: false diff --git a/roles/elasticsearch_install/tasks/main.yml b/roles/eck_install/tasks/main.yml similarity index 57% rename from roles/elasticsearch_install/tasks/main.yml rename to roles/eck_install/tasks/main.yml index 1bd42470..3caebd96 100644 --- a/roles/elasticsearch_install/tasks/main.yml +++ b/roles/eck_install/tasks/main.yml @@ -32,39 +32,26 @@ dest: "{{ (project_root_dir, 'elasticsearch') | path_join }}" mode: 0755 - - name: copy elasticsearch files - ansible.builtin.copy: - src: files/ - dest: "{{ (project_root_dir, 'elasticsearch') | path_join }}" - owner: root - group: root - mode: 0744 + # In the absence of this configuration, Elasticsearch cannot be started.
+ # https://www.elastic.co/guide/en/elasticsearch/reference/8.11/_maximum_map_count_check.html + - name: Set map count + ansible.posix.sysctl: + name: vm.max_map_count + value: '5262144' + state: present + delegate_to: "{{ groups['kube_node'][0] }}" # fix for core detection in the elasticsearch - block: - - name: check if we are running single-node deployment - shell: set -o pipefail && kubectl get nodes | awk 'NR !=1 { print }' - args: - executable: /bin/bash - register: node_check - changed_when: false - - - name: get number of cores - control plane - command: nproc - register: number_of_cores - when: node_check.stdout_lines | length == 1 - - - set_fact: nproc_output="{{ number_of_cores }}" - when: number_of_cores.changed - - - name: get number of cores - worker node - command: nproc + - name: Fetch number of cores on machine + ansible.builtin.command: nproc register: number_of_cores - delegate_to: "{{ groups['kube_node'][0] }}" - when: node_check.stdout_lines | length > 1 + changed_when: false + delegate_to: "{{ groups['kube_node'][0] }}" # in single-node, master is defined as kube_node as well - - set_fact: nproc_output="{{ number_of_cores }}" - when: number_of_cores.changed + - name: Set number of cores for elasticsearch config + ansible.builtin.set_fact: + nproc_output: "{{ number_of_cores.stdout }}" - name: populate elasticsearch files and push to controller node ansible.builtin.template: @@ -77,10 +64,15 @@ - name: create monitoring namespace kubernetes.core.k8s: - name: "monitoring" + name: "{{ eck_namespace }}" kind: Namespace state: present + - name: install ECK custom resource definitions + kubernetes.core.k8s: + state: present + src: "{{ eck_crds_url }}" + - name: create elasticsearch resources kubernetes.core.k8s: state: present @@ -88,13 +80,13 @@ loop: - elasticsearch_certs.yml - elasticsearch_storageclass.yml - - elasticsearch_pv.yml + - elasticsearch_data.yml - name: Wait till the elasticsearch certificate is created kubernetes.core.k8s_info: kind: Certificate name: elasticsearch-tls - namespace: monitoring + namespace: "{{ eck_namespace }}" wait: yes wait_condition: type: Ready @@ -102,22 +94,26 @@ wait_sleep: 10 wait_timeout: 360 - - name: add elasticsearch chart repo - kubernetes.core.helm_repository: - name: "{{ elasticsearch_chart_name }}" - repo_url: "{{ elasticsearch_chart_repo }}" + - name: install ECK operator + kubernetes.core.k8s: + state: present + namespace: "{{ eck_namespace }}" + src: "{{ eck_operator_url }}" - name: deploy elasticsearch - kubernetes.core.helm: - chart_ref: "elastic/elasticsearch" - chart_version: "{{ elasticsearch_chart_version }}" - release_name: "{{ elasticsearch_release_name }}" - release_namespace: "{{ elasticsearch_namespace }}" - values_files: "{{ (project_root_dir, 'elasticsearch', 'elasticsearch_values.yml') | path_join }}" - wait: true - timeout: 15m0s - - - name: create elasticsearch settings pod kubernetes.core.k8s: state: present - src: "{{ (project_root_dir, 'elasticsearch', 'elasticsearch_settings.yml') | path_join }}" + src: "{{ (project_root_dir, 'elasticsearch', item) | path_join }}" + loop: + - elasticsearch.yml + + - name: Wait till the elasticsearch is Ready + kubernetes.core.k8s_info: + kind: Elasticsearch + name: elasticsearch-main + namespace: "{{ eck_namespace }}" + wait: true + wait_condition: + type: ElasticsearchIsReachable + status: "True" + wait_timeout: 360 diff --git a/roles/eck_install/templates/elasticsearch.yml.j2 b/roles/eck_install/templates/elasticsearch.yml.j2 new file mode 100644 index 
00000000..182202a8 --- /dev/null +++ b/roles/eck_install/templates/elasticsearch.yml.j2 @@ -0,0 +1,67 @@ +apiVersion: elasticsearch.k8s.elastic.co/v1 +kind: Elasticsearch +metadata: + name: elasticsearch-main + namespace: "{{ eck_namespace }}" +spec: + version: "{{ eck_elasticsearch_version }}" + nodeSets: + - name: es-cluster-node + count: 1 + volumeClaimTemplates: + - metadata: + name: elasticsearch-data + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 30Gi + storageClassName: elasticsearch + podTemplate: + metadata: + labels: + # additional labels for pods + app: elasticsearch + spec: + # needed to be run on node where vm.max_map_count was set + affinity: + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: +{% if on_cloud|d(false) %} + - {{ hostvars[groups['kube_node'][0]]['ansible_nodename'] }} +{% else %} + - {{ hostvars[groups['kube_node'][0]]['ansible_hostname'] }} +{% endif %} + containers: + - name: elasticsearch + resources: + limits: + memory: 8Gi + cpu: 2 + env: +{% if "http_proxy" in proxy_env %} + - name: http_proxy + value: {{ proxy_env.http_proxy }} +{% endif %} +{% if "https_proxy" in proxy_env %} + - name: https_proxy + value: {{ proxy_env.https_proxy }} +{% endif %} +{% if "no_proxy" in proxy_env %} + - name: no_proxy + value: {{ proxy_env.no_proxy }} +{% endif %} + http: + service: + spec: + # expose this cluster Service with a ClusterIP and add public ingress for access + type: ClusterIP + tls: + certificate: + secretName: elasticsearch-tls diff --git a/roles/elasticsearch_install/files/elasticsearch_certs.yml b/roles/eck_install/templates/elasticsearch_certs.yml.j2 similarity index 77% rename from roles/elasticsearch_install/files/elasticsearch_certs.yml rename to roles/eck_install/templates/elasticsearch_certs.yml.j2 index 8443f084..6773abfd 100644 --- a/roles/elasticsearch_install/files/elasticsearch_certs.yml +++ b/roles/eck_install/templates/elasticsearch_certs.yml.j2 @@ -2,7 +2,7 @@ apiVersion: cert-manager.io/v1 kind: Issuer metadata: name: selfsigned-elasticsearch-tls-issuer-ca - namespace: monitoring + namespace: "{{ eck_namespace }}" spec: selfSigned: {} --- @@ -10,7 +10,7 @@ apiVersion: cert-manager.io/v1 kind: Certificate metadata: name: elasticsearch-tls-ca - namespace: monitoring + namespace: "{{ eck_namespace }}" spec: isCA: true commonName: elasticsearch-tls-ca @@ -27,7 +27,7 @@ apiVersion: cert-manager.io/v1 kind: Issuer metadata: name: elasticsearch-tls-ca-issuer - namespace: monitoring + namespace: "{{ eck_namespace }}" spec: ca: secretName: elasticsearch-tls-ca @@ -36,14 +36,14 @@ apiVersion: cert-manager.io/v1 kind: Certificate metadata: name: elasticsearch-tls - namespace: monitoring + namespace: "{{ eck_namespace }}" spec: secretName: elasticsearch-tls dnsNames: - - elasticsearch-master.monitoring.svc.cluster.local - - elasticsearch-master.monitoring.svc - - elasticsearch-master.monitoring - - elasticsearch-master + - elasticsearch-main-es-http.monitoring.svc.cluster.local + - elasticsearch-main-es-http.monitoring.svc + - elasticsearch-main-es-http.monitoring + - elasticsearch-main-es-http - elasticsearch - kibana-kibana.monitoring.svc.cluster.local - kibana-kibana.monitoring.svc diff --git a/roles/elasticsearch_install/templates/elasticsearch_pv.yml.j2 b/roles/eck_install/templates/elasticsearch_data.yml.j2 similarity index 70% rename from roles/elasticsearch_install/templates/elasticsearch_pv.yml.j2 rename to 
roles/eck_install/templates/elasticsearch_data.yml.j2 index b466ed97..fe6aaf59 100644 --- a/roles/elasticsearch_install/templates/elasticsearch_pv.yml.j2 +++ b/roles/eck_install/templates/elasticsearch_data.yml.j2 @@ -2,16 +2,16 @@ apiVersion: v1 kind: PersistentVolume metadata: - name: elasticsearch-pv - namespace: monitoring + name: elasticsearch-data + namespace: "{{ eck_namespace }}" labels: - name: elasticsearch-pv - app.kubernetes.io/component: elasticsearch-pv - app.kubernetes.io/name: elasticsearch-pv - app.kubernetes.io/version: {{ elasticsearch_chart_version }} + name: elasticsearch-data + app.kubernetes.io/component: elasticsearch-data + app.kubernetes.io/name: elasticsearch-data + app.kubernetes.io/version: {{ eck_version }} spec: capacity: - storage: 30Gi + storage: 35Gi volumeMode: Filesystem accessModes: ["ReadWriteOnce"] persistentVolumeReclaimPolicy: Retain diff --git a/roles/elasticsearch_install/templates/elasticsearch_storageclass.yml.j2 b/roles/eck_install/templates/elasticsearch_storageclass.yml.j2 similarity index 75% rename from roles/elasticsearch_install/templates/elasticsearch_storageclass.yml.j2 rename to roles/eck_install/templates/elasticsearch_storageclass.yml.j2 index 559a0022..523c25d1 100644 --- a/roles/elasticsearch_install/templates/elasticsearch_storageclass.yml.j2 +++ b/roles/eck_install/templates/elasticsearch_storageclass.yml.j2 @@ -3,11 +3,11 @@ apiVersion: storage.k8s.io/v1 kind: StorageClass metadata: name: elasticsearch - namespace: monitoring + namespace: "{{ eck_namespace }}" labels: app.kubernetes.io/component: elasticsearch app.kubernetes.io/name: elasticsearch - app.kubernetes.io/version: {{ elasticsearch_chart_version }} + app.kubernetes.io/version: {{ eck_version }} provisioner: kubernetes.io/no-provisioner reclaimPolicy: Retain allowVolumeExpansion: false diff --git a/roles/elasticsearch_install/templates/elasticsearch_settings.yml.j2 b/roles/elasticsearch_install/templates/elasticsearch_settings.yml.j2 deleted file mode 100644 index a8b4b0fc..00000000 --- a/roles/elasticsearch_install/templates/elasticsearch_settings.yml.j2 +++ /dev/null @@ -1,86 +0,0 @@ ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: elasticsearch-stack-settings-script - namespace: monitoring -data: - elastic-stack-settings.sh: | - #!/bin/sh - - apk add --no-cache curl jq - - sleep 300 - - UNAVAILABLE=true - - while $UNAVAILABLE; do - status=$(curl -s -k -XGET \ - --user "${ELASTIC_USERNAME}:${ELASTIC_PASS}" \ - https://elasticsearch-master.monitoring.svc.cluster.local:9200/_cluster/health | jq '.status' | tr -d '"') - if [[ "$status" == "green" || "$status" == "yellow" ]]; then - UNAVAILABLE=false - else - echo "Elasticsearch stack is not ready. Trying again after 30s..." - sleep 30 - fi - done - - result=$(curl -s -k -XPUT \ - --user "${ELASTIC_USERNAME}:${ELASTIC_PASS}" \ - -H "Content-Type: application/json" \ - https://elasticsearch-master.monitoring.svc.cluster.local:9200/_all/_settings?preserve_existing=true \ - -d '{"index.auto_expand_replicas": "0-all", "index.number_of_replicas": "1"}' | jq '.acknowledged') - - if [ "$result" == true ]; then - exit 0 - else - echo "Elasticstack replicas settings was not set properly." 
- exit 1 - fi ---- -apiVersion: v1 -kind: Pod -metadata: - name: elasticsearch-stack-settings - namespace: monitoring -spec: - restartPolicy: OnFailure - containers: - - name: elasticsearch-stack-settings - image: alpine:3.18 - command: ["/root/elastic-stack-settings.sh"] - env: -{% if "http_proxy" in proxy_env %} - - name: http_proxy - value: {{ proxy_env.http_proxy }} -{% endif %} -{% if "https_proxy" in proxy_env %} - - name: https_proxy - value: {{ proxy_env.https_proxy }} -{% endif %} -{% if "no_proxy" in proxy_env %} - - name: no_proxy - value: {{ proxy_env.no_proxy }} -{% endif %} - - name: ELASTIC_PASS - valueFrom: - secretKeyRef: - name: elasticsearch-master-credentials - key: password - - name: ELASTIC_USERNAME - valueFrom: - secretKeyRef: - name: elasticsearch-master-credentials - key: username - volumeMounts: - - name: elasticsearch-stack-settings-script - mountPath: "/root/" - volumes: - - name: elasticsearch-stack-settings-script - configMap: - name: elasticsearch-stack-settings-script - defaultMode: 0500 - items: - - key: elastic-stack-settings.sh - path: elastic-stack-settings.sh diff --git a/roles/elasticsearch_install/templates/elasticsearch_values.yml.j2 b/roles/elasticsearch_install/templates/elasticsearch_values.yml.j2 deleted file mode 100644 index e9b3932f..00000000 --- a/roles/elasticsearch_install/templates/elasticsearch_values.yml.j2 +++ /dev/null @@ -1,63 +0,0 @@ -replicas: 1 -minimumMasterNodes: 1 -createCert: false - -networkHost: "0.0.0.0" - -resources: - requests: - cpu: "1000m" - memory: "2Gi" - limits: - cpu: "1000m" - memory: "3Gi" - -volumeClaimTemplate: - accessModes: ["ReadWriteOnce"] - storageClassName: elasticsearch - selector: - matchLabels: - name: "elasticsearch-pv" - resources: - requests: - storage: 30Gi - -extraEnvs: - - name: ingest.geoip.downloader.enabled - value: "false" - -persistence: - enabled: true - labels: - # Add default labels for the volumeClaimTemplate of the StatefulSet - enabled: false - annotations: {} - -# Disable it to use your own elastic-credential Secret. 
-secret: - enabled: true - password: "" # generated randomly if not defined - -esJvmOptions: - processors.options: | - -XX:ActiveProcessorCount={{ nproc_output.stdout }} - -esConfig: - elasticsearch.yml: | - xpack.security.enabled: true - xpack.security.transport.ssl.enabled: true - xpack.security.transport.ssl.verification_mode: certificate - xpack.security.transport.ssl.client_authentication: required - xpack.security.transport.ssl.key: /usr/share/elasticsearch/config/certs/tls.key - xpack.security.transport.ssl.certificate: /usr/share/elasticsearch/config/certs/tls.crt - xpack.security.transport.ssl.certificate_authorities: /usr/share/elasticsearch/config/certs/ca.crt - xpack.security.http.ssl.enabled: true - xpack.security.http.ssl.client_authentication: optional - xpack.security.http.ssl.key: /usr/share/elasticsearch/config/certs/tls.key - xpack.security.http.ssl.certificate: /usr/share/elasticsearch/config/certs/tls.crt - xpack.security.http.ssl.certificate_authorities: /usr/share/elasticsearch/config/certs/ca.crt - -secretMounts: - - name: elastic-certificates - secretName: elasticsearch-tls - path: /usr/share/elasticsearch/config/certs diff --git a/roles/ffmpeg_install/defaults/main.yml b/roles/ffmpeg_install/defaults/main.yml index b98ec6c7..ea997809 100644 --- a/roles/ffmpeg_install/defaults/main.yml +++ b/roles/ffmpeg_install/defaults/main.yml @@ -18,7 +18,7 @@ ffmpeg_path: "{{ (project_root_dir, 'ffmpeg') | path_join }}" ffmpeg_patch_path: "{{ (ffmpeg_path, 'ffmpeg_patch') | path_join }}" # ffmpeg upstream base version -ffmpeg_commit_hash: "9b6d191" +ffmpeg_commit_hash: "9e1ea3c" ffmpeg_git_url: "https://github.com/FFmpeg/FFmpeg.git" ffmpeg_configure_options_gpu: "--enable-shared --enable-vaapi --enable-libvpl" ffmpeg_configure_options_cpu: "--enable-shared" diff --git a/roles/ffmpeg_install/vars/main.yml b/roles/ffmpeg_install/vars/main.yml index b86841ea..364ebd67 100644 --- a/roles/ffmpeg_install/vars/main.yml +++ b/roles/ffmpeg_install/vars/main.yml @@ -65,11 +65,9 @@ install_dependencies: - zlib-devel - nasm - yasm-devel - - libx264-devel - - libx265-devel + - libva-devel - fdk-aac-free - lame-devel - opus - - libvpx6-devel - libmfx-devel - libvpl-devel diff --git a/roles/gpu_dp_install/tasks/preflight_gpu_dp.yml b/roles/gpu_dp_install/tasks/preflight_gpu_dp.yml index a865e785..5c5f43c8 100644 --- a/roles/gpu_dp_install/tasks/preflight_gpu_dp.yml +++ b/roles/gpu_dp_install/tasks/preflight_gpu_dp.yml @@ -17,11 +17,17 @@ - name: Preflight--Check OS support assert: - that: ((ansible_distribution == "Ubuntu") and (ansible_distribution_version == '22.04')) + that: ((ansible_distribution == "Ubuntu") and (ansible_distribution_version is version('22.04', '=='))) or + ((ansible_os_family == "RedHat") and (ansible_distribution_version is version('9.2', '=='))) msg: >- - Currently GPU driver supports Ubuntu 22.04 while Redhat support is under development. + Currently GPU driver supports Ubuntu 22.04 and Redhat/Rocky 9.2. Please check https://dgpu-docs.intel.com/index.html for hardware support details +- name: Preflight--Container runtime check + assert: + that: container_runtime != "docker" + msg: "Intel GPU Device Plugin does not support docker as container runtime." + # Check GPU availability and preparation, the pre-defined product PCIID can be found in i915_pciids.h of kernel source tree, only DG1 is available now. 
- name: Preflight--Create empty setting list for configure_gpu on the worker node set_fact: @@ -60,7 +66,7 @@ - name: Preflight--Validate the PCIID from each node is supported assert: - that: "'{{ (item | regex_search('8086:[0-9a-zA-Z]{4}')).split(':')[1] }}' in {{ gpu_pciids }}" + that: (item | regex_search('8086:[0-9a-zA-Z]{4}')).split(':')[1] in gpu_pciids msg: - "{{ (item | regex_search('8086:[0-9a-zA-Z]{4}')).split(':')[1] }} from " - " {{ inventory_hostname }} is not in the pre-defined PCIID list of {{ gpu_pciids }}" diff --git a/roles/gpu_dp_install/templates/intel-gpu-plugin.yml.j2 b/roles/gpu_dp_install/templates/intel-gpu-plugin.yml.j2 index df0ea412..0a6a2306 100644 --- a/roles/gpu_dp_install/templates/intel-gpu-plugin.yml.j2 +++ b/roles/gpu_dp_install/templates/intel-gpu-plugin.yml.j2 @@ -13,5 +13,3 @@ spec: preferredAllocationPolicy: {{ gpu_dp_prefered_allocation | default('none') }} nodeSelector: intel.feature.node.kubernetes.io/gpu: "true" - # check if node has required PCI IDs - feature.node.kubernetes.io/pci-0380_8086.present: 'true' diff --git a/roles/imtl_install/defaults/main.yml b/roles/imtl_install/defaults/main.yml index 0767be20..75821dce 100644 --- a/roles/imtl_install/defaults/main.yml +++ b/roles/imtl_install/defaults/main.yml @@ -29,3 +29,9 @@ imtl_min_fw_version_supported: "4.20" imtl_dpdk_patches_strip: 1 imtl_ice_patches_strip: 1 + +# Update the following ranges based on available patches and testing +imtl_min_ice_version: "1.9.11" +imtl_max_ice_version: "1.11.14" +imtl_min_dpdk_version: "21.05" +imtl_max_dpdk_version: "23.03" diff --git a/roles/imtl_install/tasks/preflight.yml b/roles/imtl_install/tasks/preflight.yml index bf832d1b..5ea3195a 100644 --- a/roles/imtl_install/tasks/preflight.yml +++ b/roles/imtl_install/tasks/preflight.yml @@ -33,6 +33,14 @@ Please set update_nic_drivers option to 'true'. when: intel_media_transport_library.patch_nic_driver | default(false) +- name: Check ICE driver version support + ansible.builtin.assert: + that: + - ice_driver_version | default('1.0') is ansible.builtin.version(imtl_min_ice_version, '>=') + - ice_driver_version | default('1.0') is ansible.builtin.version(imtl_max_ice_version, '<=') + fail_msg: | + Intel Media Transport Library {{ imtl_version }} supports only ICE versions >= {{ imtl_min_ice_version }} and <= {{ imtl_max_ice_version }}. + - name: Check iommu enabled ansible.builtin.assert: that: @@ -52,10 +60,10 @@ - name: Check DPDK version support ansible.builtin.assert: that: - - dpdk_version | default('1.0') is ansible.builtin.version('23.07', '<') - - dpdk_version | default('1.0') is ansible.builtin.version('21.05', '>=') + - dpdk_version | default('1.0') is ansible.builtin.version(imtl_max_dpdk_version, '<=') + - dpdk_version | default('1.0') is ansible.builtin.version(imtl_min_dpdk_version, '>=') fail_msg: | - Intel Media Transport Library {{ imtl_version }} does support only dpdk versions >= 21.05 and < 23.07. + Intel Media Transport Library {{ imtl_version }} supports only DPDK versions >= {{ imtl_min_dpdk_version }} and <= {{ imtl_max_dpdk_version }}.
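The ansible.builtin.version test used by these preflight checks compares version strings field by field rather than lexically, which is what makes bounds such as 1.9.11 <= ice_driver_version <= 1.11.14 behave as expected. A minimal self-contained illustration of the same bounds pattern (example values only, not part of this diff):

- name: Illustrate the version bounds pattern (example only)
  ansible.builtin.assert:
    that:
      - "'1.11.2' is ansible.builtin.version('1.9.11', '>=')"   # true: field 11 > field 9
      - "'1.11.2' is ansible.builtin.version('1.11.14', '<=')"  # true: field 2 < field 14
    fail_msg: "1.11.2 would be outside the supported range"

A plain string comparison would get both checks wrong ('1.11.2' sorts before '1.9.11' and after '1.11.14' lexically), which is why the version test is used here.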
- name: Check target NIC FW version to be used for update ansible.builtin.assert: diff --git a/roles/imtl_install/tasks/redhat_deps.yml b/roles/imtl_install/tasks/redhat_deps.yml index 65728395..828be9a1 100644 --- a/roles/imtl_install/tasks/redhat_deps.yml +++ b/roles/imtl_install/tasks/redhat_deps.yml @@ -20,57 +20,10 @@ version: "{{ item.version }}" force: true loop: - - name: jsonc - url: https://github.com/json-c/json-c.git - version: json-c-0.16 - - name: libpcap - url: https://github.com/the-tcpdump-group/libpcap.git - version: libpcap-1.9 - name: gtest url: https://github.com/google/googletest.git version: v1.13.x -- name: Build jsonc - vars: - build_dir: "{{ (imtl_deps_dir, 'jsonc/build') | path_join }}" - block: - - name: (jsonc) Create build dir - ansible.builtin.file: - path: "{{ build_dir }}" - state: directory - mode: 0750 - - name: (jsonc) Run cmake - ansible.builtin.command: - cmd: cmake ../ - chdir: "{{ build_dir }}" - changed_when: true # TDOD - - name: (jsonc) Run make - community.general.make: - chdir: "{{ build_dir }}" - - name: (jsonc) Run make install - become: true - community.general.make: - target: install - chdir: "{{ build_dir }}" - -- name: Build libpcap - vars: - build_dir: "{{ (imtl_deps_dir, 'libpcap') | path_join }}" - block: - - name: (libpcap) Run configure - ansible.builtin.command: - cmd: ./configure - chdir: "{{ build_dir }}" - changed_when: true # TDOD - - name: (libpcap) Run make - community.general.make: - chdir: "{{ build_dir }}" - - name: (libpcap) Run make install - become: true - community.general.make: - target: install - chdir: "{{ build_dir }}" - - name: Build gtest vars: build_dir: "{{ (imtl_deps_dir, 'gtest/build') | path_join }}" diff --git a/roles/imtl_install/vars/main.yml b/roles/imtl_install/vars/main.yml index cdc2144a..89cc0f11 100644 --- a/roles/imtl_install/vars/main.yml +++ b/roles/imtl_install/vars/main.yml @@ -34,6 +34,8 @@ install_dependencies: - numactl-devel - libasan - SDL2-devel + - json-c-devel + - libpcap-devel # Following ones needed to build other dependencies from source - cmake - flex diff --git a/roles/infrastructure_power_manager/defaults/main.yml b/roles/infrastructure_power_manager/defaults/main.yml new file mode 100644 index 00000000..0016c097 --- /dev/null +++ b/roles/infrastructure_power_manager/defaults/main.yml @@ -0,0 +1,26 @@ +## +## Copyright (c) 2020-2023 Intel Corporation. +## +## Licensed under the Apache License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. +## You may obtain a copy of the License at +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +## See the License for the specific language governing permissions and +## limitations under the License. 
+## + +# IPM dpdk patches +ipm_dpdk_patches_git_url: https://github.com/intel/CommsPowerManagement/ +ipm_dpdk_patches_commit: 69a53d2b0068d45c592d0fdc6f82b9f354fa9d84 +ipm_dpdk_patches_repo_dir: "{{ (project_root_dir, 'ipm_dpdk_patches') | path_join }}" +ipm_dpdk_patches_base_dir: "{{ ipm_dpdk_patches_repo_dir }}/ipm/patches/dpdk/" +# IPM dpdk patches source has patches only for major.minor DPDK versions, strip patch version if present +ipm_stripped_dpdk_version: "{{ dpdk_version | regex_replace('^([0-9]+).([0-9]+).*$', '\\1.\\2') }}" +ipm_dpdk_patches_strip: 1 + +ipm_max_dpdk_version_supported: "22.11.1" diff --git a/roles/infrastructure_power_manager/tasks/dpdk_patch.yml b/roles/infrastructure_power_manager/tasks/dpdk_patch.yml new file mode 100644 index 00000000..9c9e626c --- /dev/null +++ b/roles/infrastructure_power_manager/tasks/dpdk_patch.yml @@ -0,0 +1,131 @@ +## +## Copyright (c) 2020-2023 Intel Corporation. +## +## Licensed under the Apache License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. +## You may obtain a copy of the License at +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +## See the License for the specific language governing permissions and +## limitations under the License. +## +- name: Clone repository with DPDK patches for IPM + ansible.builtin.git: + repo: "{{ ipm_dpdk_patches_git_url }}" + dest: "{{ ipm_dpdk_patches_repo_dir }}" + version: "{{ ipm_dpdk_patches_commit }}" + force: true + +- name: Check patches dir exists for defined DPDK version + ansible.builtin.stat: + path: "{{ (ipm_dpdk_patches_base_dir, ipm_stripped_dpdk_version) | path_join }}" + get_checksum: false + get_mime: false + get_attributes: false + register: patch_dir + +- name: Select patch files if no patches are available for the defined version + when: not patch_dir.stat.exists + block: + - name: Get all versions of DPDK patches + ansible.builtin.find: + paths: "{{ ipm_dpdk_patches_base_dir }}" + file_type: directory + register: dpdk_patches_dirs + + - name: Set newest available version of patches + ansible.builtin.set_fact: + newest_patch_version: "{{ (dpdk_patches_dirs.files | map(attribute='path') | map('basename') | list | community.general.version_sort)[-1] }}" + + - name: Warn about using patches for a different DPDK version + ansible.builtin.debug: + msg: | + IPM does not provide patches for DPDK v{{ dpdk_version }}. + Patches for the highest available DPDK version (v{{ newest_patch_version }}) will be used instead!
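When no patch directory matches the stripped major.minor DPDK version, the role falls back to the newest directory selected with community.general.version_sort, which orders version-like strings numerically in ascending order, so index [-1] is the newest. A standalone illustration with made-up directory names (not part of this diff):

- name: Show how version_sort picks the newest patch directory (example only)
  ansible.builtin.debug:
    msg: "{{ (['21.11', '23.03', '22.11'] | community.general.version_sort)[-1] }}"
  # version_sort yields ['21.11', '22.11', '23.03'], so [-1] prints '23.03'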
+ +- name: Set version of DPDK patches to use + ansible.builtin.set_fact: + ipm_dpdk_patches_dir: >- + {% if patch_dir.stat.exists -%} + {{ (ipm_dpdk_patches_base_dir, ipm_stripped_dpdk_version) | path_join }} + {%- else -%} + {{ (ipm_dpdk_patches_base_dir, newest_patch_version) | path_join }} + {%- endif %} + +- name: Handle DPDK patch symlink files + block: + - name: Identify links in DPDK patches + ansible.builtin.find: + paths: "{{ ipm_dpdk_patches_dir }}" + recurse: true + file_type: link + register: patch_link_list + + - name: Get link targets for each patch link + ansible.builtin.stat: + path: "{{ item }}" + loop: "{{ (patch_link_list.files | map(attribute='path') | list | sort) }}" + register: patch_links + + - name: Replace links with their targets + ansible.builtin.copy: + remote_src: true + src: "{{ item.stat.lnk_source }}" + dest: "{{ item.stat.path }}" + mode: '0644' + loop: "{{ patch_links.results }}" + +- name: Handle patch files containing only a link to another file + block: + - name: Find patch files that contain only a link to another patch file + ansible.builtin.find: + paths: "{{ ipm_dpdk_patches_dir }}" + recurse: true + contains: '^\.\.\/\S*$' + read_whole_file: true + register: link_file_list + + - name: Find original patch files from links + ansible.builtin.slurp: + src: "{{ item.path }}" + register: link_points_list + loop: "{{ link_file_list.files }}" + when: link_file_list.matched > 0 + + - name: Replace identified link files with real patch files + ansible.builtin.copy: + remote_src: true + src: "{{ (item.source | dirname, item.content | b64decode) | path_join | realpath }}" + dest: "{{ item.source }}" + mode: '0644' + force: true + loop: "{{ link_points_list.results }}" + when: link_file_list.matched > 0 + +- name: Identify DPDK patches to be used + ansible.builtin.find: + paths: "{{ ipm_dpdk_patches_dir }}" + recurse: true + register: patch_file_list + +- block: + - name: Apply DPDK patches + vars: + # Sort patch files + patch_files: "{{ (patch_file_list.files | map(attribute='path') | list | sort) }}" + ansible.posix.patch: + remote_src: true + src: "{{ item }}" + basedir: "{{ dpdk_dir }}" + strip: "{{ ipm_dpdk_patches_strip }}" + state: present + loop: "{{ patch_files }}" + when: patch_file_list.matched > 0 + rescue: + - name: Report unsupported DPDK version + ansible.builtin.fail: + msg: "Patching DPDK with the files provided by IPM failed. It is likely IPM does not support DPDK v{{ dpdk_version }}." diff --git a/roles/kubernetes_ingress_install/tasks/main.yml b/roles/infrastructure_power_manager/tasks/main.yml old mode 100755 new mode 100644 similarity index 80% rename from roles/kubernetes_ingress_install/tasks/main.yml rename to roles/infrastructure_power_manager/tasks/main.yml index 0f38b7b0..00f08efc --- a/roles/kubernetes_ingress_install/tasks/main.yml +++ b/roles/infrastructure_power_manager/tasks/main.yml @@ -13,8 +13,5 @@ ## See the License for the specific language governing permissions and ## limitations under the License. ## ---- -- name: install kubernetes ingress controller - import_tasks: kubernetes_ingress_install.yml - when: - - minio_enabled is defined and minio_enabled + +# TODO implement IPM using NDA package diff --git a/roles/infrastructure_power_manager/tasks/preflight.yml b/roles/infrastructure_power_manager/tasks/preflight.yml new file mode 100644 index 00000000..22a6257c --- /dev/null +++ b/roles/infrastructure_power_manager/tasks/preflight.yml @@ -0,0 +1,32 @@ +## +## Copyright (c) 2020-2023 Intel Corporation.
+## +## Licensed under the Apache License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. +## You may obtain a copy of the License at +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +## See the License for the specific language governing permissions and +## limitations under the License. +## +- name: Check dpdk is enabled + ansible.builtin.assert: + that: + - install_dpdk | default(false) + fail_msg: + - "IPM requires DPDK to be enabled on host." + - "Please set install_dpdk: true in host_vars." + when: inventory_hostname in groups['kube_node'] + +- name: Check dpdk supported version + ansible.builtin.assert: + that: + - dpdk_version is version(ipm_max_dpdk_version_supported, '<=') + fail_msg: + - "IPM DPDK patches can be applied only to dpdk version <= {{ ipm_max_dpdk_version_supported }}." + - Please change the DPDK version in host_vars. + when: inventory_hostname in groups['kube_node'] diff --git a/roles/ingress_nginx_install/defaults/main.yml b/roles/ingress_nginx_install/defaults/main.yml new file mode 100644 index 00000000..feb5f428 --- /dev/null +++ b/roles/ingress_nginx_install/defaults/main.yml @@ -0,0 +1,27 @@ +## +## Copyright (c) 2020-2023 Intel Corporation. +## +## Licensed under the Apache License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. +## You may obtain a copy of the License at +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +## See the License for the specific language governing permissions and +## limitations under the License. +## +ingress_nginx_helm_repo_name: ingress-nginx-repo +ingress_nginx_helm_repo_url: https://kubernetes.github.io/ingress-nginx + +ingress_nginx_helm_version: 4.8.3 +ingress_nginx_helm_name: "{{ ingress_nginx_helm_repo_name }}/ingress-nginx" +ingress_nginx_helm_release_name: ingress-nginx +ingress_nginx_namespace: "ingress-nginx" + +ingress_nginx_dir: "{{ (project_root_dir, 'ingress-nginx') | path_join }}" + +ingress_nodeport_http: 30123 +ingress_nodeport_https: 30124 diff --git a/roles/kubernetes_ingress_install/defaults/main.yml b/roles/ingress_nginx_install/tasks/cleanup.yml old mode 100755 new mode 100644 similarity index 59% rename from roles/kubernetes_ingress_install/defaults/main.yml rename to roles/ingress_nginx_install/tasks/cleanup.yml index 4a3edb9d..44b4983b --- a/roles/kubernetes_ingress_install/defaults/main.yml +++ b/roles/ingress_nginx_install/tasks/cleanup.yml @@ -13,12 +13,14 @@ ## See the License for the specific language governing permissions and ## limitations under the License. 
## ---- -kubernetes_ingress_application_name: "minio_kubernetes_ingress" -kubernetes_ingress_release_name: "minio-kubernetes-ingress" +- name: Remove ingress-nginx helm chart + kubernetes.core.helm: + release_name: "{{ ingress_nginx_helm_release_name }}" + namespace: "{{ ingress_nginx_namespace }}" + state: absent + failed_when: false # TODO rework cleanup -kubernetes_ingress_helm_repo_url: "https://helm.nginx.com/stable" -kubernetes_ingress_helm_chart_repo_name: "nginx-stable" -kubernetes_ingress_helm_chart_ref: "nginx-stable/nginx-ingress" -kubernetes_ingress_helm_chart_version: "v2.3.0" -kubernetes_ingress_helm_chart_release_namespace: "minio-tenant" +- name: Delete ingress-nginx directory + ansible.builtin.file: + path: "{{ ingress_nginx_dir }}" + state: absent diff --git a/roles/ingress_nginx_install/tasks/main.yml b/roles/ingress_nginx_install/tasks/main.yml new file mode 100644 index 00000000..baad5370 --- /dev/null +++ b/roles/ingress_nginx_install/tasks/main.yml @@ -0,0 +1,45 @@ +## +## Copyright (c) 2020-2023 Intel Corporation. +## +## Licensed under the Apache License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. +## You may obtain a copy of the License at +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +## See the License for the specific language governing permissions and +## limitations under the License. +## +- name: Create ingress-nginx directory + ansible.builtin.file: + path: "{{ ingress_nginx_dir }}" + state: directory + mode: '0750' + +- name: Template ingress-nginx helm values + ansible.builtin.template: + src: helm_values.yaml.j2 + dest: "{{ ingress_nginx_dir }}/helm_values.yaml" + mode: '0640' + +- name: Add ingress-nginx helm repository + kubernetes.core.helm_repository: + url: "{{ ingress_nginx_helm_repo_url }}" + name: "{{ ingress_nginx_helm_repo_name }}" + state: present + +- name: Deploy ingress-nginx helm chart + kubernetes.core.helm: + chart_ref: "{{ ingress_nginx_helm_name }}" + chart_version: "{{ ingress_nginx_helm_version }}" + release_name: "{{ ingress_nginx_helm_release_name }}" + state: present + namespace: "{{ ingress_nginx_namespace }}" + create_namespace: true + wait: true + wait_timeout: 3m + values_files: + - "{{ ingress_nginx_dir }}/helm_values.yaml" diff --git a/roles/ingress_nginx_install/templates/helm_values.yaml.j2 b/roles/ingress_nginx_install/templates/helm_values.yaml.j2 new file mode 100644 index 00000000..751e430e --- /dev/null +++ b/roles/ingress_nginx_install/templates/helm_values.yaml.j2 @@ -0,0 +1,32 @@ +## ingress-nginx Helm chart configuration +## Ref: https://github.com/kubernetes/ingress-nginx/blob/main/charts/ingress-nginx/values.yaml +## + +# -- Override the deployment namespace; defaults to .Release.Namespace +namespaceOverride: {{ ingress_nginx_namespace }} + +commonLabels: + app.kubernetes.io/name: ingress-nginx + app.kubernetes.io/part-of: ingress-nginx + +controller: + ## nginx configuration + ## Ref: https://github.com/kubernetes/ingress-nginx/blob/main/docs/user-guide/nginx-configuration/index.md + ## + config: + use-forwarded-headers: "true" + ssl-ciphers: "AES128-CCM-SHA256:CHACHA20-POLY1305-SHA256:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES128-GCM-SHA256" + ssl-protocols: "TLSv1.2 
TLSv1.3" + priorityClassName: "k8s-cluster-critical" + # -- Bare-metal considerations via the host network https://kubernetes.github.io/ingress-nginx/deploy/baremetal/#via-the-host-network + # Ingress status was blank because there is no Service exposing the Ingress-Nginx Controller in a configuration using the host network, the default --publish-service flag used in standard cloud setups does not apply + reportNodeInternalIp: false + # -- Use a `DaemonSet` or `Deployment` + service: + type: NodePort + nodePorts: + http: {{ ingress_nodeport_http }} + https: {{ ingress_nodeport_https }} + ## Set external traffic policy to: "Local" to preserve source IP on providers supporting it. + ## Ref: https://kubernetes.io/docs/tutorials/services/source-ip/#source-ip-for-services-with-typeloadbalancer + externalTrafficPolicy: Local diff --git a/roles/install_dpdk/tasks/install_dpdk_meson.yml b/roles/install_dpdk/tasks/install_dpdk_meson.yml index b670daae..7614ac40 100644 --- a/roles/install_dpdk/tasks/install_dpdk_meson.yml +++ b/roles/install_dpdk/tasks/install_dpdk_meson.yml @@ -17,7 +17,7 @@ - name: install dpdk-devel required for libraries enablement in RHEL / Rocky dnf: name: dpdk-devel - when: ansible_os_family == "RedHat" and ansible_distribution_version >= '8.2' + when: ansible_os_family == "RedHat" and ansible_distribution_version is version('8.2', '>=') - name: build and install dpdk using meson and ninja tools block: diff --git a/roles/install_dpdk/tasks/main.yml b/roles/install_dpdk/tasks/main.yml index 8c3cc0a7..0812227e 100644 --- a/roles/install_dpdk/tasks/main.yml +++ b/roles/install_dpdk/tasks/main.yml @@ -82,6 +82,12 @@ when: dpdk_local_patches_dir is defined +- name: Apply DPDK patches for Infrastructure Power manager (IPM) + ansible.builtin.include_role: + name: infrastructure_power_manager + tasks_from: dpdk_patch.yml + when: infrastructure_power_manager_enabled | default(false) + - name: Apply DPDK patches for media transport library (IMTL) ansible.builtin.include_role: name: imtl_install diff --git a/roles/intel_base_container/defaults/main.yml b/roles/intel_base_container/defaults/main.yml index e83a373a..0ad4c625 100644 --- a/roles/intel_base_container/defaults/main.yml +++ b/roles/intel_base_container/defaults/main.yml @@ -19,7 +19,8 @@ base_container_dockerfile_path: "{{ (base_container_path, 'dockerfile') | path_j base_container_test_path: "{{ (base_container_path, 'test') | path_join }}" base_container_sudo: true -aibox_base_container_version: 3.1 +aibox_base_container_version: 4.0 +vss_base_container_version: 24.1 aibox_base_container_set: - { name: aibox-base, @@ -161,3 +162,37 @@ aibox_base_container_test_set: testname: test_opencv.sh, username: aibox } + +vss_base_container_set: + - { name: vss-dlstreamer, + base: ubuntu, + base_version: 22.04, + filename: Dockerfile.dlstreamer, + buildname: build_dlstreamer.sh, + username: vss, + adduser: true, + gpu_stack: true, + gpu_stack_version: default, + oneapi_base: false, + oneapi_ai: false, + openvino: true, + openvino_dev: true, + openvino_version: 2023.0.0, + dlstreamer: true, + dlstreamer_version: 2023.0.0, + ffmpeg: false, + ffmpeg_version: default, + opencv: false, + opencv_version: default, + dpdk: false, + } + +vss_base_container_test_set: + - { name: test-vss-dlstreamer, + base: vss-dlstreamer, + base_version: "{{ vss_base_container_version }}", + filename: Dockerfile.test-dlstreamer, + entryname: test_dlstreamer_entry.sh, + testname: test_dlstreamer.sh, + username: vss + } diff --git 
a/roles/intel_base_container/files/install_ffmpeg.sh b/roles/intel_base_container/files/install_ffmpeg.sh index 21fc2f0e..af1508ed 100755 --- a/roles/intel_base_container/files/install_ffmpeg.sh +++ b/roles/intel_base_container/files/install_ffmpeg.sh @@ -40,8 +40,8 @@ apt-get install -y \ libvpl-dev \ FFMPEG_BUILD_DIR=ffmpeg_build -FFMPEG_UPSTREAM_VERSION=9b6d191 -FFMPEG_PATCH_VERSION=2023q2 +FFMPEG_UPSTREAM_VERSION=9e1ea3c +FFMPEG_PATCH_VERSION=2023q3 FFMPEG_INSTALL_PREFIX=/usr/local mkdir -p $FFMPEG_BUILD_DIR diff --git a/roles/intel_base_container/files/install_gpu_stack.sh b/roles/intel_base_container/files/install_gpu_stack.sh index 1971f037..0f3d10bc 100755 --- a/roles/intel_base_container/files/install_gpu_stack.sh +++ b/roles/intel_base_container/files/install_gpu_stack.sh @@ -14,45 +14,45 @@ gpg --dearmor < /tmp/intel-graphics.key > /usr/share/keyrings/intel-graphics.gpg if [ "${GPU_TYPE}" = "Flex" ]; then rm -f /etc/apt/sources.list.d/intel-gpu-jammy.list echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy/production/2328 unified" >> /etc/apt/sources.list.d/intel-gpu-jammy.list - apt-get update + apt-get update echo "install server gpu stack" # install umd and runtime packages for server GPU apt-get install -y --allow-downgrades \ - intel-opencl-icd=23.22.26516.29-682~22.04 \ - intel-level-zero-gpu=1.3.26516.29-682~22.04 \ + intel-opencl-icd=23.22.26516.34-682~22.04 \ + intel-level-zero-gpu=1.3.26516.34-682~22.04 \ level-zero=1.11.0-649~22.04 \ - intel-media-va-driver-non-free=23.2.4-678~22.04 \ - libmfx1=23.2.2-678~22.04 \ - libmfxgen1=23.2.4-678~22.04 \ - libvpl2=2023.3.0.0-678~22.04 \ - libegl-mesa0=23.2.0.20230712.1-2073~22.04 \ - libegl1-mesa=23.2.0.20230712.1-2073~22.04 \ - libegl1-mesa-dev=23.2.0.20230712.1-2073~22.04 \ - libgbm1=23.2.0.20230712.1-2073~22.04 \ - libgl1-mesa-dev=23.2.0.20230712.1-2073~22.04 \ - libgl1-mesa-dri=23.2.0.20230712.1-2073~22.04 \ - libglapi-mesa=23.2.0.20230712.1-2073~22.04 \ - libgles2-mesa-dev=23.2.0.20230712.1-2073~22.04 \ - libglx-mesa0=23.2.0.20230712.1-2073~22.04 \ + intel-media-va-driver-non-free=23.2.4-682~22.04 \ + libmfx1=23.2.2-682~22.04 \ + libmfxgen1=23.2.4-682~22.04 \ + libvpl2=2023.3.0.0-682~22.04 \ + libegl-mesa0=24.0.0.20231114.1-2088~22.04 \ + libegl1-mesa=24.0.0.20231114.1-2088~22.04 \ + libegl1-mesa-dev=24.0.0.20231114.1-2088~22.04 \ + libgbm1=24.0.0.20231114.1-2088~22.04 \ + libgl1-mesa-dev=24.0.0.20231114.1-2088~22.04 \ + libgl1-mesa-dri=24.0.0.20231114.1-2088~22.04 \ + libglapi-mesa=24.0.0.20231114.1-2088~22.04 \ + libgles2-mesa-dev=24.0.0.20231114.1-2088~22.04 \ + libglx-mesa0=24.0.0.20231114.1-2088~22.04 \ libigdgmm12=22.3.7-678~22.04 \ - libxatracker2=23.2.0.20230712.1-2073~22.04 \ - mesa-va-drivers=23.2.0.20230712.1-2073~22.04 \ - mesa-vdpau-drivers=23.2.0.20230712.1-2073~22.04 \ - mesa-vulkan-drivers=23.2.0.20230712.1-2073~22.04 \ - va-driver-all=2.19.0.2-64~u22.04 + libxatracker2=24.0.0.20231114.1-2088~22.04 \ + mesa-va-drivers=24.0.0.20231114.1-2088~22.04 \ + mesa-vdpau-drivers=24.0.0.20231114.1-2088~22.04 \ + mesa-vulkan-drivers=24.0.0.20231114.1-2088~22.04 \ + va-driver-all=2.20.0.2-75~u22.04 # install dev packages for server GPU apt-get install -y --allow-downgrades \ - libigc1=1.0.14062.15-682~22.04 \ - libigc-dev=1.0.14062.15-682~22.04 \ + libigc1=1.0.14062.19-682~22.04 \ + libigc-dev=1.0.14062.19-682~22.04 \ intel-igc-cm=1.0.202-682~22.04 \ - libigdfcl1=1.0.14062.15-682~22.04 \ - libigdfcl-dev=1.0.14062.15-682~22.04 \ - 
libigfxcmrt7=23.2.4-678~22.04 \ - libigfxcmrt-dev=23.2.4-678~22.04 \ + libigdfcl1=1.0.14062.19-682~22.04 \ + libigdfcl-dev=1.0.14062.19-682~22.04 \ + libigfxcmrt7=23.2.4-682~22.04 \ + libigfxcmrt-dev=23.2.4-682~22.04 \ level-zero-dev=1.11.0-649~22.04 \ - libvpl-dev=2023.3.0.0-678~22.04 + libvpl-dev=2023.3.0.0-682~22.04 elif [[ "${GPU_TYPE}" = "Arc" || "${GPU_TYPE}" = "iGPU" ]]; then rm -f /etc/apt/sources.list.d/intel-gpu-jammy.list @@ -62,42 +62,42 @@ elif [[ "${GPU_TYPE}" = "Arc" || "${GPU_TYPE}" = "iGPU" ]]; then echo "install client gpu stack" # install umd and runtime packages for client GPU apt-get install -y --allow-downgrades \ - intel-opencl-icd=23.26.26690.36-704~22.04 \ - intel-level-zero-gpu=1.3.26690.36-704~22.04 \ - level-zero=1.12.0-693~22.04 \ - intel-media-va-driver-non-free=23.3.1-704~22.04 \ - libmfx1=23.2.2-704~22.04 \ - libmfxgen1=23.3.1-704~22.04 \ - libvpl2=2023.3.1.0-704~22.04 \ - libegl-mesa0=23.2.0.20230712.1-2073~22.04 \ - libegl1-mesa=23.2.0.20230712.1-2073~22.04 \ - libegl1-mesa-dev=23.2.0.20230712.1-2073~22.04 \ - libgbm1=23.2.0.20230712.1-2073~22.04 \ - libgl1-mesa-dev=23.2.0.20230712.1-2073~22.04 \ - libgl1-mesa-dri=23.2.0.20230712.1-2073~22.04 \ - libglapi-mesa=23.2.0.20230712.1-2073~22.04 \ - libgles2-mesa-dev=23.2.0.20230712.1-2073~22.04 \ - libglx-mesa0=23.2.0.20230712.1-2073~22.04 \ - libigdgmm12=22.3.8-687~22.04 \ - libxatracker2=23.2.0.20230712.1-2073~22.04 \ - mesa-va-drivers=23.2.0.20230712.1-2073~22.04 \ - mesa-vdpau-drivers=23.2.0.20230712.1-2073~22.04 \ - mesa-vulkan-drivers=23.2.0.20230712.1-2073~22.04 \ - va-driver-all=2.19.0.2-66~u22.04 + intel-opencl-icd=23.35.27191.42-775~22.04 \ + intel-level-zero-gpu=1.3.27191.42-775~22.04 \ + level-zero=1.14.0-744~22.04 \ + intel-media-va-driver-non-free=23.4.0-775~22.04 \ + libmfx1=23.2.2-775~22.04 \ + libmfxgen1=23.4.0-775~22.04 \ + libvpl2=2023.3.1.0-775~22.04 \ + libegl-mesa0=24.0.0.20231114.1-2088~22.04 \ + libegl1-mesa=24.0.0.20231114.1-2088~22.04 \ + libegl1-mesa-dev=24.0.0.20231114.1-2088~22.04 \ + libgbm1=24.0.0.20231114.1-2088~22.04 \ + libgl1-mesa-dev=24.0.0.20231114.1-2088~22.04 \ + libgl1-mesa-dri=24.0.0.20231114.1-2088~22.04 \ + libglapi-mesa=24.0.0.20231114.1-2088~22.04 \ + libgles2-mesa-dev=24.0.0.20231114.1-2088~22.04 \ + libglx-mesa0=24.0.0.20231114.1-2088~22.04 \ + libigdgmm12=22.3.12-742~22.04 \ + libxatracker2=24.0.0.20231114.1-2088~22.04 \ + mesa-va-drivers=24.0.0.20231114.1-2088~22.04 \ + mesa-vdpau-drivers=24.0.0.20231114.1-2088~22.04 \ + mesa-vulkan-drivers=24.0.0.20231114.1-2088~22.04 \ + va-driver-all=2.20.0.2-75~u22.04 # install dev packages for client GPU apt-get install -y --allow-downgrades \ - libigc1=1.0.14508.23-704~22.04 \ - libigc-dev=1.0.14508.23-704~22.04 \ - intel-igc-cm=1.0.206-704~22.04 \ - libigdfcl1=1.0.14508.23-704~22.04 \ - libigdfcl-dev=1.0.14508.23-704~22.04 \ - libigfxcmrt7=23.3.1-704~22.04 \ - libigfxcmrt-dev=23.3.1-704~22.04 \ - level-zero-dev=1.12.0-693~22.04 \ - libvpl-dev=2023.3.1.0-704~22.04 + libigc1=1.0.15136.24-775~22.04 \ + libigc-dev=1.0.15136.24-775~22.04 \ + intel-igc-cm=1.0.206-775~22.04 \ + libigdfcl1=1.0.15136.24-775~22.04 \ + libigdfcl-dev=1.0.15136.24-775~22.04 \ + libigfxcmrt7=23.4.0-775~22.04 \ + libigfxcmrt-dev=23.4.0-775~22.04 \ + level-zero-dev=1.14.0-744~22.04 \ + libvpl-dev=2023.3.1.0-775~22.04 -else +else echo "Unknown GPU, no gpu stack will be installed" fi @@ -108,4 +108,5 @@ apt-get install -y --allow-downgrades \ vainfo \ clinfo \ mesa-utils \ - vulkan-tools + vulkan-tools \ + onevpl-tools diff --git 
a/roles/intel_base_container/tasks/main.yml b/roles/intel_base_container/tasks/main.yml index 0f6bd836..28c38d00 100644 --- a/roles/intel_base_container/tasks/main.yml +++ b/roles/intel_base_container/tasks/main.yml @@ -28,7 +28,7 @@ debug: msg: "Base container set for : {{ profile_name }} " -- name: Specify base container set +- name: Specify AIBOX base container set set_fact: base_container_version: "{{ aibox_base_container_version }}" base_container_set: "{{ aibox_base_container_set }}" @@ -36,6 +36,14 @@ when: - profile_name == "on_prem_aibox" +- name: Specify VSS base container set + set_fact: + base_container_version: "{{ vss_base_container_version }}" + base_container_set: "{{ vss_base_container_set }}" + base_container_test_set: "{{ vss_base_container_test_set }}" + when: + - profile_name in ["on_prem_vss", "on_prem"] + - name: Generate base container Dockerfiles vars: container_name: "{{ item.name }}" @@ -133,3 +141,36 @@ - 'test_openvino_dev_entry.sh' - 'test_data' become: "{{ base_container_sudo }}" + +- name: Build base container images with docker + community.docker.docker_image: + build: + path: "{{ (base_container_dockerfile_path) | path_join }}" + dockerfile: "{{ item.filename }}" + args: + http_proxy: "{{ http_proxy }}" + https_proxy: "{{ https_proxy }}" + name: "{{ item.name }}" + tag: "{{ base_container_version }}" + push: false + source: build + with_items: "{{ base_container_set }}" + when: + - build_base_images | default(false) | bool + - container_runtime is in ['docker'] + - inventory_hostname == groups['kube_node'][0] + +- name: prepare base container images with containerd + containers.podman.podman_image: + name: "{{ item.name }}" + tag: "{{ base_container_version }}" + path: "{{ (base_container_dockerfile_path) | path_join }}" + build: + file: "{{ item.filename }}" + extra_args: "--build-arg http_proxy={{ http_proxy }} --build-arg https_proxy={{ https_proxy }}" + push: false + with_items: "{{ base_container_set }}" + when: + - build_base_images | default(false) | bool + - container_runtime is in ['containerd'] + - inventory_hostname == groups['kube_node'][0] diff --git a/roles/intel_csl_excat/tasks/main.yml b/roles/intel_csl_excat/tasks/main.yml index e3471bd6..0c42334f 100644 --- a/roles/intel_csl_excat/tasks/main.yml +++ b/roles/intel_csl_excat/tasks/main.yml @@ -70,6 +70,7 @@ ansible.builtin.command: cmd: "./gencerts.sh certs {{ csl_excat_admission_name }} csl-excat 365" chdir: "{{ (csl_home, 'deployments/helm') | path_join }}" + changed_when: true when: inventory_hostname == groups['kube_control_plane'][0] - name: mark kube node who supports excat diff --git a/roles/intel_csl_excat/tasks/preflight.yml b/roles/intel_csl_excat/tasks/preflight.yml index d2d302f8..fdf73abc 100644 --- a/roles/intel_csl_excat/tasks/preflight.yml +++ b/roles/intel_csl_excat/tasks/preflight.yml @@ -31,14 +31,14 @@ - name: assert if no tar ball assert: - that: "provided_csl.stat.checksum | default('')" + that: provided_csl.stat.checksum | default('') msg: - File {{ csl_excat_tar_staging_location }} on localhost does not exist. - Please refer to group_vars/all.yml and search intel_csl_excat for how to get this tar ball. - name: check the csl_excat image integrity assert: - that: "provided_csl.stat.checksum | default('') == '{{ csl_excat_image_checksum }}'" + that: provided_csl.stat.checksum | default('') == csl_excat_image_checksum msg: - File {{ csl_excat_tar_staging_location }} on localhost is NOT the expected one. - Please provide the correct file. 
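The base-container image-build tasks added above run only when `build_base_images` is true and only on the first `kube_node` host, and they pick the build tool from `container_runtime` (community.docker.docker_image for docker, podman build for containerd). A minimal opt-in sketch, assuming these flags live in group_vars (the file location is an assumption; the variable names come straight from the `when:` conditions above):

    # group_vars/all.yml (sketch)
    build_base_images: true          # tasks default this to false, so builds are off unless opted in
    container_runtime: containerd    # 'docker' -> community.docker.docker_image, 'containerd' -> podman build

Since both tasks set push: false, the built images stay on that single node unless a registry push is added separately.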
diff --git a/roles/intel_dp_operator/defaults/main.yml b/roles/intel_dp_operator/defaults/main.yml index 02a8ec50..b9e4a118 100644 --- a/roles/intel_dp_operator/defaults/main.yml +++ b/roles/intel_dp_operator/defaults/main.yml @@ -15,7 +15,7 @@ ## --- intel_dp_git_url: "https://github.com/intel/intel-device-plugins-for-kubernetes.git" -intel_dp_version: "0.26.0" +intel_dp_version: "0.28.0" intel_dp_dir: "{{ (project_root_dir, 'intel-device-plugins') | path_join }}" intel_dp_templates_dir: "{{ (project_root_dir, 'intel-device-plugins-templates') | path_join }}" intel_dp_namespace: kube-system diff --git a/roles/intel_dp_operator/tasks/add_dp_labels.yml b/roles/intel_dp_operator/tasks/add_dp_labels.yml index c2e7b9da..3912c56f 100644 --- a/roles/intel_dp_operator/tasks/add_dp_labels.yml +++ b/roles/intel_dp_operator/tasks/add_dp_labels.yml @@ -14,21 +14,45 @@ ## limitations under the License. ## --- -- name: add labels for nodes with configured QAT - command: kubectl label nodes {{ hostvars[node_name]['ansible_hostname'] }} qat.configured=true --overwrite +- name: Apply labels for nodes with configured QAT + kubernetes.core.k8s: + state: present + definition: + apiVersion: v1 + kind: Node + metadata: + name: "{{ hostvars[node_name]['ansible_hostname'] }}" + labels: + qat.configured: 'true' when: - qat_dp_enabled | default(false) - hostvars[node_name]['configure_qat'] | default(false) - hostvars[node_name]['qat_devices'] | length > 0 -- name: add labels for nodes with configured SGX - command: kubectl label nodes {{ hostvars[node_name]['ansible_hostname'] }} sgx.configured=true --overwrite +- name: Apply labels for nodes with configured SGX + kubernetes.core.k8s: + state: present + definition: + apiVersion: v1 + kind: Node + metadata: + name: "{{ hostvars[node_name]['ansible_hostname'] }}" + labels: + sgx.configured: 'true' when: - sgx_dp_enabled | default(false) - hostvars[node_name]['configure_sgx'] | default(false) -- name: add labels for nodes with configured DSA - command: kubectl label nodes {{ hostvars[node_name]['ansible_hostname'] }} dsa.configured=true --overwrite +- name: Apply labels for nodes with configured DSA + kubernetes.core.k8s: + state: present + definition: + apiVersion: v1 + kind: Node + metadata: + name: "{{ hostvars[node_name]['ansible_hostname'] }}" + labels: + dsa.configured: 'true' when: - dsa_dp_enabled | default(false) - hostvars[node_name]['configure_dsa_devices'] | default(false) diff --git a/roles/intel_eci/tasks/eci_preflight.yml b/roles/intel_eci/tasks/eci_preflight.yml index a92c94af..c6dbd7ab 100644 --- a/roles/intel_eci/tasks/eci_preflight.yml +++ b/roles/intel_eci/tasks/eci_preflight.yml @@ -16,7 +16,7 @@ --- - name: validate linux distro version for Intel ECI ansible.builtin.assert: - that: ansible_distribution == 'Ubuntu' and ansible_distribution_version == '22.04' + that: ansible_distribution == 'Ubuntu' and ansible_distribution_version is version('22.04', '==') fail_msg: "Intel ECI is supported only on Ubuntu 22.04 ({{ ansible_distribution }} {{ ansible_distribution_version }} is not supported)" success_msg: "Assertion passed. Intel ECI is supported and can be deployed on target with Ubuntu 22.04" @@ -39,7 +39,7 @@ - name: validate Intel ECI repo ansible.builtin.assert: - that: intel_eci_repo_checksum == "{{ intel_eci_repo | checksum }}" + that: intel_eci_repo_checksum == (intel_eci_repo | checksum) msg: - Please configure intel_eci_repo in group vars. - Please contact eci-support@intel.com on how to access this repo. 
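Both assertion fixes above apply the same pattern: `that:` expressions are already evaluated as Jinja, so the nested "{{ }}" templating is dropped, and distro versions are compared with the `version()` test instead of plain string equality, so components compare numerically rather than lexically. A standalone sketch of the pattern, with an illustrative task name:

    - name: validate distro version (sketch of the assertion pattern above)
      ansible.builtin.assert:
        that:
          - ansible_distribution == 'Ubuntu'
          - ansible_distribution_version is version('22.04', '==')
        fail_msg: "Unsupported: {{ ansible_distribution }} {{ ansible_distribution_version }}"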
diff --git a/roles/intel_ethernet_operator/defaults/main.yml b/roles/intel_ethernet_operator/defaults/main.yml index 6bd8140b..144c3d93 100644 --- a/roles/intel_ethernet_operator/defaults/main.yml +++ b/roles/intel_ethernet_operator/defaults/main.yml @@ -55,3 +55,5 @@ intel_ethernet_operator_flow_config_rules_dir: "{{ (intel_ethernet_operator_flow intel_ethernet_operator_fw_url: "https://downloadmirror.intel.com/786047/E810_NVMUpdatePackage_v4_30_Linux.tar.gz" intel_ethernet_operator_fw_sum: "993d79ac623b71c5378855738917495a0fa8ffb8" + +ieo_catalog_name: operatorhubio-catalog # gets overridden when local build is enabled diff --git a/roles/intel_ethernet_operator/tasks/cache_server.yml b/roles/intel_ethernet_operator/tasks/cache_server.yml index 0fafe256..5f88c830 100644 --- a/roles/intel_ethernet_operator/tasks/cache_server.yml +++ b/roles/intel_ethernet_operator/tasks/cache_server.yml @@ -42,6 +42,12 @@ - name: Prepare Cache server when: '"True" in is_ddp_update_enabled or "True" in is_fw_update_enabled' block: + - name: Create Intel Ethernet Operator directory to store yaml files + ansible.builtin.file: + path: "{{ intel_ethernet_operator_packages_dir }}" + state: directory + mode: '0750' + - name: Download DDP packages vars: nic_driver: ice @@ -67,48 +73,48 @@ COPY packages /usr/share/nginx/html - block: - - name: build cache webserver image + - name: build cache webserver image - docker ansible.builtin.command: docker build -t {{ intel_ethernet_operator_cache_webserver_image }} -f Dockerfile . changed_when: true args: chdir: "{{ intel_ethernet_operator_files_dir }}" - - name: push cache webserver image + - name: push cache webserver image - docker ansible.builtin.command: docker push {{ intel_ethernet_operator_cache_webserver_image }} changed_when: true when: - '"docker" in container_runtime' - block: - - name: build cache webserver image + - name: build cache webserver image - podman ansible.builtin.command: podman build -t {{ intel_ethernet_operator_cache_webserver_image }} -f Dockerfile .
changed_when: true args: chdir: "{{ intel_ethernet_operator_files_dir }}" - - name: push cache webserver image + - name: push cache webserver image - podman ansible.builtin.command: podman push {{ intel_ethernet_operator_cache_webserver_image }} changed_when: true when: - '"docker" not in container_runtime' - - name: populate cache webserver yaml files and push to controller node + - name: Template cache webserver resources ansible.builtin.template: - src: "{{ item.src }}" - dest: "{{ (intel_ethernet_operator_files_dir, item.dst) | path_join }}" - force: yes - mode: preserve + src: "{{ item.template }}" + force: true + dest: "{{ (intel_ethernet_operator_files_dir, item.name) | path_join }}" + mode: '0640' loop: - - {src: 'cache-server.yml.j2', dst: 'cache-server.yml'} - - {src: 'cache-server-svc.yml.j2', dst: 'cache-server-svc.yml'} + - { name: 'cache-server.yaml', template: 'cache-server.yaml.j2' } + - { name: 'cache-server-svc.yaml', template: 'cache-server-svc.yaml.j2' } - name: deploy cache webserver kubernetes.core.k8s: state: present - src: "{{ (intel_ethernet_operator_files_dir, item) | path_join }}" + template: "{{ item }}" loop: - - cache-server.yml - - cache-server-svc.yml + - cache-server.yaml.j2 + - cache-server-svc.yaml.j2 - name: check if all pods are running ansible.builtin.shell: "set -o pipefail && kubectl get pods -n {{ intel_ethernet_operator_namespace }} | awk 'NR != 1 { print $3 }'" diff --git a/roles/intel_ethernet_operator/tasks/ddp.yml b/roles/intel_ethernet_operator/tasks/ddp.yml index 076fabee..17cc8800 100644 --- a/roles/intel_ethernet_operator/tasks/ddp.yml +++ b/roles/intel_ethernet_operator/tasks/ddp.yml @@ -14,6 +14,12 @@ ## limitations under the License. ## --- +- name: Create Intel Ethernet Operator directory to store yaml files + ansible.builtin.file: + path: "{{ intel_ethernet_operator_ddp_files_dir }}" + state: directory + mode: '0750' + - name: Find DDP packages atributes ansible.builtin.find: path: "{{ intel_ethernet_operator_packages_dir }}" @@ -25,15 +31,15 @@ block: - name: Populate Intel Ethernet Operator yaml files and push to controller node ansible.builtin.template: - src: "ddp-update.yml.j2" - dest: "{{ (intel_ethernet_operator_ddp_files_dir, node_name + '-ddp-update.yml') | path_join }}" + src: "ddp-update.yaml.j2" + dest: "{{ (intel_ethernet_operator_ddp_files_dir, node_name + '-ddp-update.yaml') | path_join }}" force: yes mode: preserve - name: Apply DDP update on {{ node_name }} kubernetes.core.k8s: state: present - src: "{{ (intel_ethernet_operator_ddp_files_dir, node_name + '-ddp-update.yml') | path_join }}" + src: "{{ (intel_ethernet_operator_ddp_files_dir, node_name + '-ddp-update.yaml') | path_join }}" - name: Wait for start of node reboot ansible.builtin.wait_for: @@ -88,7 +94,7 @@ - name: Remove DDP CR after update kubernetes.core.k8s: - src: "{{ (intel_ethernet_operator_ddp_files_dir, node_name + '-ddp-update.yml') | path_join }}" + src: "{{ (intel_ethernet_operator_ddp_files_dir, node_name + '-ddp-update.yaml') | path_join }}" state: absent - name: Reload nic modules diff --git a/roles/intel_ethernet_operator/tasks/ethernet_operator.yml b/roles/intel_ethernet_operator/tasks/ethernet_operator.yml index eda1c982..810a5286 100644 --- a/roles/intel_ethernet_operator/tasks/ethernet_operator.yml +++ b/roles/intel_ethernet_operator/tasks/ethernet_operator.yml @@ -14,103 +14,109 @@ ## limitations under the License. 
## --- -- name: clone Intel Ethernet Operator repository - ansible.builtin.git: - repo: "{{ intel_ethernet_operator_git }}" - version: "{{ intel_ethernet_operator_git_ref }}" - dest: "{{ intel_ethernet_operator_dir }}" - force: true - -- name: get GOPATH - ansible.builtin.command: go env GOPATH - register: gopath - changed_when: false - -- name: build Intel Ethernet Operator - vars: - # in case of RHEL 9.2 & podman, SYS_CHROOT is missing from default caps. - # no need to limit only to RHEL OS, no impact to Ubuntu - podman_build_args: "--cap-add SYS_CHROOT" - community.general.make: - target: "{{ item }}" - chdir: "{{ intel_ethernet_operator_dir }}" - params: - VERSION: "{{ intel_ethernet_operator_build_version }}" - IMAGE_REGISTRY: "{{ registry_local_address }}" - IMGTOOL: "{{ 'docker' if container_runtime == 'docker' else 'podman' }}" - TLS_VERIFY: "{{ intel_ethernet_operator_make_tls }}" - TARGET_PLATFORM: "{{ intel_ethernet_operator_target_platform }}" - UFT_IMAGE: "{{ uft_image }}:{{ uft_image_ver }}" - DOCKERARGS: "{{ podman_build_args if container_runtime in ['crio', 'containerd'] else omit }}" - register: ieo_build_status - retries: 5 - delay: 120 - until: - - "'Github rate-limiter failed the request.' not in ieo_build_status.stdout" - environment: - PATH: "{{ gopath.stdout }}/bin:/usr/local/go/bin:/usr/sbin:/usr/bin:/sbin:/bin" - loop: - - build_all - - push_all - - catalog-build - - catalog-push - -- name: create Intel Ethernet Operator directory to store yaml files - ansible.builtin.file: - path: "{{ item }}" - state: directory - mode: '0644' - loop: - - "{{ intel_ethernet_operator_files_dir }}" - - "{{ intel_ethernet_operator_packages_dir }}" - - "{{ intel_ethernet_operator_fw_files_dir }}" - - "{{ intel_ethernet_operator_ddp_files_dir }}" - - "{{ intel_ethernet_operator_flow_config_files_dir }}" - - "{{ intel_ethernet_operator_flow_config_rules_dir }}" - -- name: create Intel Ethernet Operator namespace +- name: Create Intel Ethernet Operator namespace kubernetes.core.k8s: name: "{{ intel_ethernet_operator_namespace }}" kind: Namespace state: present -- name: populate Intel Ethernet Operator yaml files and push to controller node +- name: Create IEO templates directory + ansible.builtin.file: + state: directory + path: "{{ intel_ethernet_operator_files_dir }}" + mode: '0750' + +- name: Create Intel Ethernet Operator Catalog from source + when: intel_ethernet_operator_local_build | default(false) + block: + - name: Clone Intel Ethernet Operator repository + ansible.builtin.git: + repo: "{{ intel_ethernet_operator_git }}" + version: "{{ intel_ethernet_operator_git_ref }}" + dest: "{{ intel_ethernet_operator_dir }}" + force: true + + - name: Get GOPATH + ansible.builtin.command: go env GOPATH + register: gopath + changed_when: false + + - name: Build Intel Ethernet Operator + vars: + # in case of RHEL 9.2 & podman, SYS_CHROOT is missing from default caps. 
+ # no need to limit only to RHEL OS, no impact to Ubuntu + podman_build_args: "--cap-add SYS_CHROOT" + community.general.make: + target: "{{ item }}" + chdir: "{{ intel_ethernet_operator_dir }}" + params: + VERSION: "{{ intel_ethernet_operator_build_version }}" + IMAGE_REGISTRY: "{{ registry_local_address }}" + IMGTOOL: "{{ 'docker' if container_runtime == 'docker' else 'podman' }}" + TLS_VERIFY: "{{ intel_ethernet_operator_make_tls }}" + TARGET_PLATFORM: "{{ intel_ethernet_operator_target_platform }}" + UFT_IMAGE: "{{ uft_image }}:{{ uft_image_ver }}" + DOCKERARGS: "{{ podman_build_args if container_runtime in ['crio', 'containerd'] else omit }}" + register: ieo_build_status + retries: 5 + delay: 120 + until: + - "'Github rate-limiter failed the request.' not in ieo_build_status.stdout" + environment: + PATH: "{{ gopath.stdout }}/bin:/usr/local/go/bin:/usr/sbin:/usr/bin:/sbin:/bin" + loop: + - build_all + - push_all + - catalog-build + - catalog-push + + - name: Set local IEO catalog name + ansible.builtin.set_fact: + ieo_catalog_name: local-ieo-catalog + + - name: Template Catalog resource to target machine + ansible.builtin.template: + src: "catalog.yaml.j2" + force: yes + dest: "{{ (intel_ethernet_operator_files_dir, 'catalog.yaml') | path_join }}" + mode: '0640' + + - name: Deploy local IEO catalog + kubernetes.core.k8s: + state: present + template: "catalog.yaml.j2" + + - name: Wait for Catalog source to be running + kubernetes.core.k8s_info: + api_version: "operators.coreos.com/v1alpha1" + kind: CatalogSource + namespace: olm + name: "{{ ieo_catalog_name }}" + register: catalog_info + retries: 12 + delay: 5 + until: | + catalog_info.resources is defined and + catalog_info.resources | length != 0 and + catalog_info.resources[0].status.connectionState.lastObservedState | default("NOTREADY") == "READY" + + +- name: Template Subscription & OperatorGroup resources to target machine ansible.builtin.template: - src: "{{ item.src }}" - dest: "{{ (intel_ethernet_operator_files_dir, item.dst) | path_join }}" - force: true - mode: preserve + src: "{{ item.template }}" + dest: "{{ (intel_ethernet_operator_files_dir, item.name) | path_join }}" + mode: '0640' loop: - - {src: 'catalog.yml.j2', dst: 'catalog.yml'} - - {src: 'operator-group.yml.j2', dst: 'operator-group.yml'} - - {src: 'subscription.yml.j2', dst: 'subscription.yml'} + - { name: 'operator-group.yaml', template: 'operator-group.yaml.j2' } + - { name: 'subscription.yaml', template: 'subscription.yaml.j2' } -- name: Deploy Catalog source and OperatorGroup +- name: Deploy Intel Ethernet Operator via Subscription & OperatorGroup kubernetes.core.k8s: state: present - src: "{{ (intel_ethernet_operator_files_dir, item) | path_join }}" + template: "{{ item }}" loop: - - catalog.yml - - operator-group.yml - -- name: wait for Catalog source to be running - kubernetes.core.k8s_info: - api_version: "operators.coreos.com/v1alpha1" - kind: CatalogSource - namespace: "{{ intel_ethernet_operator_namespace }}" - name: intel-ethernet-operators - register: catalog_info - retries: 12 - delay: 5 - until: | - catalog_info.resources is defined and - catalog_info.resources | length != 0 and - catalog_info.resources[0].status.connectionState.lastObservedState | default("NOTREADY") == "READY" - -- name: create Subscription - kubernetes.core.k8s: - state: present - src: "{{ (intel_ethernet_operator_files_dir, 'subscription.yml') | path_join }}" + - operator-group.yaml.j2 + - subscription.yaml.j2 - name: wait for Ethernet Operator deployment 
kubernetes.core.k8s_info: diff --git a/roles/intel_ethernet_operator/tasks/flow_config_deployment.yml b/roles/intel_ethernet_operator/tasks/flow_config_deployment.yml index 1b8b0309..cb1552db 100644 --- a/roles/intel_ethernet_operator/tasks/flow_config_deployment.yml +++ b/roles/intel_ethernet_operator/tasks/flow_config_deployment.yml @@ -14,6 +14,15 @@ ## limitations under the License. ## --- +- name: Create Intel Ethernet Operator directory to store yaml files + ansible.builtin.file: + path: "{{ item }}" + state: directory + mode: '0750' + loop: + - "{{ intel_ethernet_operator_flow_config_files_dir }}" + - "{{ intel_ethernet_operator_flow_config_rules_dir }}" + - name: flow config files ansible.builtin.include_tasks: flow_config_files.yml loop: "{{ groups['kube_node'] }}" @@ -29,18 +38,18 @@ force: yes mode: preserve loop: - - {src: 'flow-config-sriov-network.yml.j2', dst: 'flow-config-sriov-network.yml'} - - {src: 'flow-config-node-agent.yml.j2', dst: 'flow-config-node-agent.yml'} + - {src: 'flow-config-sriov-network.yaml.j2', dst: 'flow-config-sriov-network.yaml'} + - {src: 'flow-config-node-agent.yaml.j2', dst: 'flow-config-node-agent.yaml'} - name: create SRIOV network attachment definition for the DCF VF pool kubernetes.core.k8s: state: present - src: "{{ (intel_ethernet_operator_flow_config_files_dir, 'flow-config-sriov-network.yml') | path_join }}" + src: "{{ (intel_ethernet_operator_flow_config_files_dir, 'flow-config-sriov-network.yaml') | path_join }}" - name: create FlowConfig Node Agent deployment CR kubernetes.core.k8s: state: present - src: "{{ (intel_ethernet_operator_flow_config_files_dir, 'flow-config-node-agent.yml') | path_join }}" + src: "{{ (intel_ethernet_operator_flow_config_files_dir, 'flow-config-node-agent.yaml') | path_join }}" - name: wait for FlowConfig Daemon ansible.builtin.pause: # TODO replace with condition wait, not simple as there are no labels in flowconfig daemon pods diff --git a/roles/intel_ethernet_operator/tasks/flow_config_files.yml b/roles/intel_ethernet_operator/tasks/flow_config_files.yml index 1d17b5ca..2c7182bb 100644 --- a/roles/intel_ethernet_operator/tasks/flow_config_files.yml +++ b/roles/intel_ethernet_operator/tasks/flow_config_files.yml @@ -25,14 +25,14 @@ force: yes mode: preserve loop: - - {src: 'flow-config-sriov-policy.yml.j2', dst: 'flow-config-sriov-policy.yml', create: true } - - {src: 'flow-config-node-flow.yml.j2', dst: 'flow-config-node-flow.yml', create: "{{ node_flow_config }}" } + - {src: 'flow-config-sriov-policy.yaml.j2', dst: 'flow-config-sriov-policy.yaml', create: true } + - {src: 'flow-config-node-flow.yaml.j2', dst: 'flow-config-node-flow.yaml', create: "{{ node_flow_config }}" } when: item.create | bool - name: apply SRIOV Network Node Policy for Flow Config kubernetes.core.k8s: state: present - src: "{{ (intel_ethernet_operator_flow_config_files_dir, node_name + '-flow-config-sriov-policy.yml') | path_join }}" + src: "{{ (intel_ethernet_operator_flow_config_files_dir, node_name + '-flow-config-sriov-policy.yaml') | path_join }}" - name: wait for SRIOV Network Node Policy ansible.builtin.pause: diff --git a/roles/intel_ethernet_operator/tasks/flow_config_rules.yml b/roles/intel_ethernet_operator/tasks/flow_config_rules.yml index db8994be..9f1cb13e 100644 --- a/roles/intel_ethernet_operator/tasks/flow_config_rules.yml +++ b/roles/intel_ethernet_operator/tasks/flow_config_rules.yml @@ -16,44 +16,34 @@ --- - name: prepare Flow Configuration rules if provided block: - - name: check if local Flow Configuration rules 
directory exists - ansible.builtin.stat: - path: "{{ hostvars[node_name]['intel_ethernet_operator']['flow_config_dir'] }}" - delegate_to: "localhost" - become: false - register: flow_config_rules_directory - - name: find available Flow Configuration rules manifests ansible.builtin.find: paths: "{{ hostvars[node_name]['intel_ethernet_operator']['flow_config_dir'] }}" recurse: yes delegate_to: "localhost" become: false - when: - - flow_config_rules_directory.stat.exists | bool - - flow_config_rules_directory.stat.isdir | bool register: flow_config_rules_files_found - - name: copy Flow Configuration rules manifests + - name: Copy Flow Configuration rules manifests ansible.builtin.copy: src: "{{ file.path }}" dest: "{{ (intel_ethernet_operator_flow_config_rules_dir, hostvars[node_name]['inventory_hostname'] + '-' + file.path | basename) | path_join }}" force: yes - owner: root - mode: preserve + owner: "{{ ansible_user }}" + mode: '0640' loop: "{{ flow_config_rules_files_found.files }}" loop_control: loop_var: file when: - - not flow_config_rules_files_found.skipped | default(false) - flow_config_rules_files_found.matched > 0 - - name: create Flow Configuration rules if provided - ansible.builtin.command: "kubectl apply -f ./" - args: - chdir: "{{ intel_ethernet_operator_flow_config_rules_dir }}" + - name: Create Flow Configuration rules if provided + kubernetes.core.k8s: + state: present + apply: true + definition: "{{ lookup('file', item.path) | from_yaml }}" + loop: "{{ flow_config_rules_files_found.files }}" when: - - not flow_config_rules_files_found.skipped | default(false) - flow_config_rules_files_found.matched > 0 when: hostvars[node_name]['intel_ethernet_operator']['flow_config_dir'] is defined diff --git a/roles/intel_ethernet_operator/tasks/fw.yml b/roles/intel_ethernet_operator/tasks/fw.yml index c53c0e94..5b2148e3 100644 --- a/roles/intel_ethernet_operator/tasks/fw.yml +++ b/roles/intel_ethernet_operator/tasks/fw.yml @@ -14,20 +14,26 @@ ## limitations under the License. 
## --- +- name: Create Intel Ethernet Operator directory to store yaml files + ansible.builtin.file: + path: "{{ intel_ethernet_operator_fw_files_dir }}" + state: directory + mode: '0750' + - name: Firmware update when: hostvars[node_name]['intel_ethernet_operator']['fw_update'] |d(false) block: - name: Populate Intel Ethernet Operator yaml files and push to controller node ansible.builtin.template: src: "firmware-update.yml.j2" - dest: "{{ (intel_ethernet_operator_fw_files_dir, node_name + '-fw-update.yml') | path_join }}" + dest: "{{ (intel_ethernet_operator_fw_files_dir, node_name + '-fw-update.yaml') | path_join }}" force: yes mode: preserve - name: Apply FW update on {{ node_name }} kubernetes.core.k8s: state: present - src: "{{ (intel_ethernet_operator_fw_files_dir, node_name + '-fw-update.yml') | path_join }}" + src: "{{ (intel_ethernet_operator_fw_files_dir, node_name + '-fw-update.yaml') | path_join }}" - name: Wait for start of node reboot ansible.builtin.wait_for: @@ -78,7 +84,7 @@ - name: Remove FW CR after update kubernetes.core.k8s: state: absent - src: "{{ (intel_ethernet_operator_fw_files_dir, node_name + '-fw-update.yml') | path_join }}" + src: "{{ (intel_ethernet_operator_fw_files_dir, node_name + '-fw-update.yaml') | path_join }}" - name: Remove EthernetNodeConfig after update kubernetes.core.k8s: diff --git a/roles/intel_ethernet_operator/tasks/preflight_ethernet_operator.yml b/roles/intel_ethernet_operator/tasks/preflight_ethernet_operator.yml index 09436381..005c4eee 100644 --- a/roles/intel_ethernet_operator/tasks/preflight_ethernet_operator.yml +++ b/roles/intel_ethernet_operator/tasks/preflight_ethernet_operator.yml @@ -95,3 +95,30 @@ that: not minio_enabled | default(false) msg: "Flow Configuration requires MinIO to be disabled." when: intel_ethernet_operator_flow_config_enabled + + - name: Check if flow config directory exists + block: + - name: Load flow_config_dir stats + ansible.builtin.stat: + path: "{{ hostvars[node_name]['intel_ethernet_operator']['flow_config_dir'] }}" + register: flow_config_dir_stat + delegate_to: localhost + become: false + - name: Check that flow config directory exists + ansible.builtin.assert: + that: flow_config_dir_stat.stat.exists + fail_msg: "Defined directory in parameter intel_ethernet_operator_flow_config_dir does not exist on the Ansible host." + when: + - intel_ethernet_operator_flow_config_enabled + - hostvars[node_name]['intel_ethernet_operator']['flow_config_dir'] is defined + +- name: Check if Operator is set to be built from source when flow config is enabled + ansible.builtin.assert: + that: + - intel_ethernet_operator_local_build | default(false) + fail_msg: + - "IEO flow configuration requires Operator to be built from source." + - "Please set intel_ethernet_operator_local_build to 'true' in group_vars."
+ when: + - intel_ethernet_operator_flow_config_enabled + - inventory_hostname == groups['kube_control_plane'][0] diff --git a/roles/intel_ethernet_operator/templates/cache-server-svc.yml.j2 b/roles/intel_ethernet_operator/templates/cache-server-svc.yaml.j2 similarity index 100% rename from roles/intel_ethernet_operator/templates/cache-server-svc.yml.j2 rename to roles/intel_ethernet_operator/templates/cache-server-svc.yaml.j2 diff --git a/roles/intel_ethernet_operator/templates/cache-server.yml.j2 b/roles/intel_ethernet_operator/templates/cache-server.yaml.j2 similarity index 91% rename from roles/intel_ethernet_operator/templates/cache-server.yml.j2 rename to roles/intel_ethernet_operator/templates/cache-server.yaml.j2 index 49a01c4f..73f82de5 100644 --- a/roles/intel_ethernet_operator/templates/cache-server.yml.j2 +++ b/roles/intel_ethernet_operator/templates/cache-server.yaml.j2 @@ -26,9 +26,6 @@ spec: nodeSelector: kubernetes.io/hostname: {{ hostvars[groups['kube_control_plane'][0]]['ansible_hostname'] | lower }} tolerations: - - effect: NoSchedule - key: node-role.kubernetes.io/master - operator: Exists - effect: NoSchedule key: node-role.kubernetes.io/control-plane operator: Exists diff --git a/roles/intel_ethernet_operator/templates/catalog.yml.j2 b/roles/intel_ethernet_operator/templates/catalog.yaml.j2 similarity index 66% rename from roles/intel_ethernet_operator/templates/catalog.yml.j2 rename to roles/intel_ethernet_operator/templates/catalog.yaml.j2 index cbdb14a9..198fdcea 100644 --- a/roles/intel_ethernet_operator/templates/catalog.yml.j2 +++ b/roles/intel_ethernet_operator/templates/catalog.yaml.j2 @@ -1,8 +1,8 @@ apiVersion: operators.coreos.com/v1alpha1 kind: CatalogSource metadata: - name: intel-ethernet-operators - namespace: {{ intel_ethernet_operator_namespace }} + name: {{ ieo_catalog_name }} + namespace: olm labels: app: intel-ethernet-operator version: {{ intel_ethernet_operator_git_ref }} @@ -10,4 +10,4 @@ spec: sourceType: grpc image: {{ intel_ethernet_operator_catalog_image }} publisher: Intel - displayName: Intel ethernet operators(Local) + displayName: Intel ethernet operators (Local) diff --git a/roles/intel_ethernet_operator/templates/ddp-update.yml.j2 b/roles/intel_ethernet_operator/templates/ddp-update.yaml.j2 similarity index 100% rename from roles/intel_ethernet_operator/templates/ddp-update.yml.j2 rename to roles/intel_ethernet_operator/templates/ddp-update.yaml.j2 diff --git a/roles/intel_ethernet_operator/templates/firmware-update.yml.j2 b/roles/intel_ethernet_operator/templates/firmware-update.yaml.j2 similarity index 100% rename from roles/intel_ethernet_operator/templates/firmware-update.yml.j2 rename to roles/intel_ethernet_operator/templates/firmware-update.yaml.j2 diff --git a/roles/intel_ethernet_operator/templates/flow-config-node-agent.yml.j2 b/roles/intel_ethernet_operator/templates/flow-config-node-agent.yaml.j2 similarity index 100% rename from roles/intel_ethernet_operator/templates/flow-config-node-agent.yml.j2 rename to roles/intel_ethernet_operator/templates/flow-config-node-agent.yaml.j2 diff --git a/roles/intel_ethernet_operator/templates/flow-config-node-flow.yml.j2 b/roles/intel_ethernet_operator/templates/flow-config-node-flow.yaml.j2 similarity index 100% rename from roles/intel_ethernet_operator/templates/flow-config-node-flow.yml.j2 rename to roles/intel_ethernet_operator/templates/flow-config-node-flow.yaml.j2 diff --git a/roles/intel_ethernet_operator/templates/flow-config-sriov-network.yml.j2 
b/roles/intel_ethernet_operator/templates/flow-config-sriov-network.yaml.j2 similarity index 100% rename from roles/intel_ethernet_operator/templates/flow-config-sriov-network.yml.j2 rename to roles/intel_ethernet_operator/templates/flow-config-sriov-network.yaml.j2 diff --git a/roles/intel_ethernet_operator/templates/flow-config-sriov-policy.yml.j2 b/roles/intel_ethernet_operator/templates/flow-config-sriov-policy.yaml.j2 similarity index 100% rename from roles/intel_ethernet_operator/templates/flow-config-sriov-policy.yml.j2 rename to roles/intel_ethernet_operator/templates/flow-config-sriov-policy.yaml.j2 diff --git a/roles/intel_ethernet_operator/templates/operator-group.yml.j2 b/roles/intel_ethernet_operator/templates/operator-group.yaml.j2 similarity index 70% rename from roles/intel_ethernet_operator/templates/operator-group.yml.j2 rename to roles/intel_ethernet_operator/templates/operator-group.yaml.j2 index 7b13b922..e2110815 100644 --- a/roles/intel_ethernet_operator/templates/operator-group.yml.j2 +++ b/roles/intel_ethernet_operator/templates/operator-group.yaml.j2 @@ -3,9 +3,6 @@ kind: OperatorGroup metadata: name: intel-ethernet-operator namespace: {{ intel_ethernet_operator_namespace }} - labels: - app: intel-ethernet-operator - version: {{ intel_ethernet_operator_git_ref }} spec: targetNamespaces: - {{ intel_ethernet_operator_namespace }} diff --git a/roles/intel_ethernet_operator/templates/subscription.yml.j2 b/roles/intel_ethernet_operator/templates/subscription.yaml.j2 similarity index 68% rename from roles/intel_ethernet_operator/templates/subscription.yml.j2 rename to roles/intel_ethernet_operator/templates/subscription.yaml.j2 index 8babe8fb..7e42d5f7 100644 --- a/roles/intel_ethernet_operator/templates/subscription.yml.j2 +++ b/roles/intel_ethernet_operator/templates/subscription.yaml.j2 @@ -1,11 +1,8 @@ apiVersion: operators.coreos.com/v1alpha1 kind: Subscription metadata: - name: intel-ethernet-subscription + name: intel-ethernet-operator-sub namespace: {{ intel_ethernet_operator_namespace }} - labels: - app: intel-ethernet-operator - version: {{ intel_ethernet_operator_git_ref }} spec: {% if http_proxy is defined or https_proxy is defined %} config: @@ -19,5 +16,5 @@ spec: {% endif %} channel: alpha name: intel-ethernet-operator - source: intel-ethernet-operators - sourceNamespace: {{ intel_ethernet_operator_namespace }} + source: {{ ieo_catalog_name }} + sourceNamespace: olm diff --git a/roles/intel_flexran/defaults/main.yml b/roles/intel_flexran/defaults/main.yml index 14135fa3..b1f96d5b 100644 --- a/roles/intel_flexran/defaults/main.yml +++ b/roles/intel_flexran/defaults/main.yml @@ -21,34 +21,41 @@ # CR = Custom Resource # ACC100/ACC200 = Intel vRAN Dedicated H/W SRIOV-FEC Accelerator Devices -# - include_role: dpdk -# when: dpdk_dir is not defined # to allow tagged execution dpdk_dir: "{{ (project_root_dir, 'dpdk-' + dpdk_version) | path_join }}" -# Intel FlexRAN -# intel_flexran_repo: "Intel’s Developer Zone Portal aka RDC" -# intel_flexran_token: "not public. pkg access requires NDA. 
see docs/flexran_guide.md" intel_flexran_staging_location: "/tmp/flexran/" # a directory on localhost (ansible host) -intel_flexran_ver: "23.07" # "22.03" (RA22.06) "22.07" (RA22.08) "22.07.3" (RA22.11) "22.11" (RA23.02) "23.03" (RA23.07) "23.07" (RA23.10) -intel_flexran_pod_version_icx_sp: "22.07" # (RA23.07) -intel_flexran_pod_version_spr_ee: "23.07" # "23.03"(RA23.07) +intel_flexran_ver: "23.11" +intel_flexran_pod_version_icx_sp: "22.07" +intel_flexran_pod_version_spr_ee: "23.07" intel_flexran_namespace: "default" intel_flexran_dir: "{{ (project_root_dir, 'intel-flexran') | path_join }}" -intel_flexran_files_dir: "{{ (project_root_dir, 'intel-flexran-files') | path_join }}" # for FEC ACC CRs, kernel cmdline, etc -intel_flexran_dpdk_ver: "22.11.1" # "21.11" for FlexRAN 22.03, 22.07, 22.07.3, 22.11 +intel_flexran_files_dir: "{{ (project_root_dir, 'intel-flexran-files') | path_join }}" # for FEC ACC CRs, etc +intel_flexran_dpdk_ver: "22.11.1" intel_flexran_dpdk_dir: "{{ (project_root_dir, 'dpdk-' + intel_flexran_dpdk_ver) | path_join }}" -intel_flexran_dpdk_zip: "dpdk_patch-{{ intel_flexran_ver }}.patch.zip" -intel_flexran_dpdk_zip_chk: "dab1a0c3a0530be9904d62d3c3f4f88166b73360dcc11402500070237000f468" # SHA256 for dpdk_patch-22.07.3.patch.zip - intel_flexran_dpdk_patch: "dpdk_patch-{{ intel_flexran_ver }}.patch" -intel_flexran_dpdk_patch_chk: "fd80a13c454cd7930ada3704306d29bf8c3acfef2630b7d410f6c215d32028ff" # SHA256 for dpdk_patch-23.07.patch - -intel_flexran_patch: "FlexRAN-R{{ intel_flexran_ver }}.zip" -intel_flexran_patch_chk: "1089d1bd3d86fe2f2198c497fa26e6f9322fd867f5f6ece087190499ff427593" # SHA256 for FlexRAN-R22.07.3.zip - -intel_pfbb_version: "v23.03" +intel_flexran_dpdk_patch_chk: "d92f5ee0d5ff835550465142c41f3cb6fe9e9731420ae40826b5dbc1586247d9" # SHA256 for dpdk_patch-23.11.patch -inih_version: "r44" +intel_pfbb_version: "v23.11" +pf_bb_download_dir: "{{ (intel_flexran_dir, 'source/pf-bb-config') | path_join }}" +# pf_bb_config files +pf_bb_cfg: + acc100: + host: "./acc100/acc100_config_pf_4g5g.cfg" + pod: "./acc100/acc100_config_vf_5g.cfg" + acc200: + host: "./acc200/acc200_config_pf_5g.cfg" + pod: "./acc200/acc200_config_vf_5g.cfg" rt_test_repo: "https://git.kernel.org/pub/scm/utils/rt-tests/rt-tests.git" rt_test_version: "v2.5" + +containerd_conf_file: "{{ '/var/lib/rancher/rke2/agent/etc/containerd/config.toml' if kube_provisioner == 'rke2' else '/etc/containerd/config.toml' }}" + +# flexran pod mode deployment files +flexran_pod: + icx: + timer: "intel_flexran_pod_timer_mode_icx_sp.yaml" + xran: "intel_flexran_pod_xran_mode_icx_sp.yaml" + spr: + timer: "intel_flexran_pod_timer_mode_spr_ee.yaml" + xran: "intel_flexran_pod_xran_mode_spr_ee.yaml" diff --git a/roles/intel_flexran/files/cek_flexran_nic_mac.sh b/roles/intel_flexran/files/cek_flexran_nic_mac.sh new file mode 100644 index 00000000..8b29b388 --- /dev/null +++ b/roles/intel_flexran/files/cek_flexran_nic_mac.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +SRIOV_NUMVFS_MAPPINGS=${SRIOV_NUMVFS_MAPPINGS:-"/etc/cek/cek_sriov_numvfs"} + +setup_vfs_mac() { + echo "Setting up VFs MAC for FlexRAN" + if [[ ! -r "${SRIOV_NUMVFS_MAPPINGS}" ]]; then + echo "File ${SRIOV_NUMVFS_MAPPINGS} doesn't exist, no VFs MAC will be configured" + return 0 + fi + + j=0 + while read -r pci_address numvfs interface_name; do + echo "${pci_address}" + for i in $(seq "${numvfs}") + do + ip link set "${interface_name}" vf $((i-1)) mac 00:11:22:33:00:$((i-1))${j} + done + ip link show "${interface_name}" + j=$((j+1)) + done < "${SRIOV_NUMVFS_MAPPINGS}" +} + +setup_vfs_mac diff --git a/roles/intel_flexran/files/cek_sriov_fec_init.sh b/roles/intel_flexran/files/cek_sriov_fec_init.sh new file mode 100644 index 00000000..9784d4d7 --- /dev/null +++ b/roles/intel_flexran/files/cek_sriov_fec_init.sh @@ -0,0 +1,112 @@ +#!/bin/bash +# +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +DEVBIND_TOOL=${DEVBIND_TOOL:-"/usr/local/bin/dpdk-devbind.py"} +IGB_UIO_PATH=${IGB_UIO_PATH:-"/opt/cek/dpdk-kmods/linux/igb_uio/igb_uio.ko"} +### 0000:f7:00.0 acc200 igb_uio 0 +### 0000:f7:00.0 acc200 vfio-pci 1 +CEK_FEC_INFO=${FEC_PF_DRIVER:-"/etc/cek/cek_fec_info"} + +PFBBCONFIG_DIR=${PFBBCONFIG_DIR:-"/opt/cek/intel-flexran/source/pf-bb-config"} +PFBBCONFIG_TOOL=${PFBBCONFIG_TOOL:-"${PFBBCONFIG_DIR}/pf_bb_config"} +CFG_ACC100_HOST=${CFG_ACC100_HOST:-"${PFBBCONFIG_DIR}/acc100/acc100_config_pf_4g5g.cfg"} +CFG_ACC100_POD=${CFG_ACC100_POD:-"${PFBBCONFIG_DIR}/acc100/acc100_config_vf_5g.cfg"} +CFG_ACC200_HOST=${CFG_ACC200_HOST:-"${PFBBCONFIG_DIR}/acc200/acc200_config_pf_5g.cfg"} +CFG_ACC200_POD=${CFG_ACC200_POD:-"${PFBBCONFIG_DIR}/acc200/acc200_config_vf_5g.cfg"} + +configure_fec() { + if [[ ! -r "${CEK_FEC_INFO}" ]]; then + echo "File ${CEK_FEC_INFO} doesn't exist." + return 0 + fi + + while read -r pci_address fec_type pf_driver vf_num; do + if [[ ${pci_address} == "" ]] || [[ ${fec_type} == "" ]] || [[ ${pf_driver} == "" ]] || [[ ${vf_num} == "" ]]; then + echo "Empty PCI address or FEC type or pf driver or vf num, skipping..." + continue + fi + + if [[ ${pf_driver} == "igb_uio" ]]; then + numvfs_path="/sys/bus/pci/devices/${pci_address}/max_vfs" + else + numvfs_path="/sys/bus/pci/devices/${pci_address}/sriov_numvfs" + fi + + # reset FEC device + echo "Cleaning configuration on ${pci_address}" + # In case of VFIO mode, pf_bb_config runs daemon mode, to reconfigure first kill the existing pf_bb_config process + if [[ ${pf_driver} == "vfio-pci" ]]; then + pkill pf_bb_config + fi + + if [[ -e "${numvfs_path}" ]]; then + echo 0 > "${numvfs_path}" + fi + $DEVBIND_TOOL -u "${pci_address}" + + # Load PF driver + if [[ -d "/sys/bus/pci/drivers/${pf_driver}" ]]; then + echo "PF driver: ${pf_driver} already exists, do nothing." + else + echo "Bind PF driver: ${pf_driver}" + if [[ ${pf_driver} == "igb_uio" ]]; then + if [[ ! 
-e "${IGB_UIO_PATH}" ]]; then + echo " ${IGB_UIO_PATH} doesn't exist." + return 0 + fi + modprobe uio + insmod "${IGB_UIO_PATH}" + else + modprobe "${pf_driver}" + fi + fi + + # Bind PF to driver + $DEVBIND_TOOL -b "${pf_driver}" "${pci_address}" + + # Create VF if needed + if [[ ${vf_num} -ne 0 ]]; then + echo 0 > "${numvfs_path}" + echo "${vf_num}" > "${numvfs_path}" + + # Bind VF to driver + vf_pci_addrs="$(lspci | grep -i acc | grep -i -e 0d5d -e 57c1 | cut -f1 -d' ')" + for vf_pci_addr in ${vf_pci_addrs} + do + $DEVBIND_TOOL -b vfio-pci "${vf_pci_addr}" + done + fi + + # pfBBConfig + cd "${PFBBCONFIG_DIR}" || exit + if [[ ${fec_type} == "acc100" ]]; then + if [[ ${vf_num} -eq 0 ]]; then + ${PFBBCONFIG_TOOL} "${fec_type}" -c "${CFG_ACC100_HOST}" + else + ${PFBBCONFIG_TOOL} "${fec_type}" -c "${CFG_ACC100_POD}" + fi + else + if [[ ${vf_num} -eq 0 ]]; then + ${PFBBCONFIG_TOOL} "${fec_type}" -c "${CFG_ACC200_HOST}" + else + ${PFBBCONFIG_TOOL} "${fec_type}" -v 00112233-4455-6677-8899-aabbccddeeff -c "${CFG_ACC200_POD}" + fi + fi + done < "${CEK_FEC_INFO}" +} + +configure_fec diff --git a/roles/intel_flexran/files/kernel_cmdline_gen.sh b/roles/intel_flexran/files/kernel_cmdline_gen.sh deleted file mode 100644 index b9a2844b..00000000 --- a/roles/intel_flexran/files/kernel_cmdline_gen.sh +++ /dev/null @@ -1,39 +0,0 @@ -#! /bin/bash - -threads_per_core=$(lscpu | grep "Thread(s) per core" | awk -F ':' '{print $2}' | xargs) -cores_per_socket=$(lscpu | grep "Core(s) per socket" | awk -F ':' '{print $2}' | xargs) -socket=$(lscpu | grep "Socket(s)" |awk -F ':' '{print $2}' | xargs) - - -# On socket 0, core 0-1 and its sibling thread core will be kept for housekeeping. -# On socket 1, the first two cores and its sibling will be kept for housekeeping -# all the other cores will be isolated - -# Set isolcpus and housekeeping -if [ "$socket" == "1" ] ; then - if [ "$threads_per_core" == "2" ] ; then - isolcpus="2-$(( cores_per_socket - 1 )),$(( cores_per_socket + 2 ))-$(( cores_per_socket * 2 - 1 ))" - housekeeping="0-1,$(( cores_per_socket ))-$(( cores_per_socket + 1 ))" - else - isolcpus="2-$(( cores_per_socket - 1 ))" - housekeeping="0-1" - fi -elif [ "$socket" == "2" ]; then - if [ "$threads_per_core" == "2" ] ; then - isolcpus="2-$(( cores_per_socket - 1 )),$(( cores_per_socket + 2 ))-$(( cores_per_socket * 2 - 1 )),$(( cores_per_socket * 2 + 2 ))-$(( cores_per_socket * 3 - 1 )),$(( cores_per_socket * 3 + 2 ))-$(( cores_per_socket * 4 - 1 ))" - housekeeping="0-1,$(( cores_per_socket ))-$(( cores_per_socket + 1 )),$(( cores_per_socket * 2 ))-$(( cores_per_socket * 2 + 1 )),$(( cores_per_socket * 3 ))-$(( cores_per_socket * 3 + 1 ))" - else - isolcpus="2-$(( cores_per_socket - 1 )),$(( cores_per_socket + 2 ))-$(( cores_per_socket * 2 - 1 ))" - housekeeping="0-1,$(( cores_per_socket ))-$(( cores_per_socket + 1 ))" - fi -fi - -# Set hugepage size -if [ "$cores_per_socket" -lt "32" ] ; then - pagesize="40" -else - pagesize="60" -fi - -flexran_kernel_cmdline="hugepagesz=1G hugepages=$pagesize hugepagesz=2M hugepages=0 default_hugepagesz=1G nmi_watchdog=0 softlockup_panic=0 intel_iommu=on iommu=pt vfio_pci.enable_sriov=1 vfio_pci.disable_idle_d3=1 rcu_nocbs=$isolcpus irqaffinity=$housekeeping isolcpus=managed_irq,domain,$isolcpus kthread_cpus=$housekeeping nohz_full=$isolcpus crashkernel=auto enforcing=0 quiet rcu_nocb_poll rhgb selinux=0 mce=off audit=0 pci=realloc pci=assign-busses rdt=l3cat skew_tick=1 nosoftlockup nohz=on" -echo "$flexran_kernel_cmdline" diff --git 
a/roles/intel_flexran/tasks/bind_eth.yml b/roles/intel_flexran/tasks/bind_eth.yml index f18d5618..d986d24f 100644 --- a/roles/intel_flexran/tasks/bind_eth.yml +++ b/roles/intel_flexran/tasks/bind_eth.yml @@ -15,7 +15,8 @@ ## --- - name: set DP VFs on DP0 for FlexRAN in Docker POD - shell: "for i in {0..3}; do ip link set {{ dataplane_interfaces[0].name }} vf ${i} mac 00:11:22:33:00:${i}0; done; ip link show {{ dataplane_interfaces[0].name }}" # noqa yaml[line-length] + ansible.builtin.shell: >- + for i in {0..3}; do ip link set {{ dataplane_interfaces[0].name }} vf ${i} mac 00:11:22:33:00:${i}0; done; ip link show {{ dataplane_interfaces[0].name }} args: executable: /bin/bash register: vfs_macs_dp0 @@ -23,7 +24,8 @@ failed_when: vfs_macs_dp0.rc != 0 - name: set DP VFs on DP1 for FlexRAN in Docker POD - shell: "for i in {0..3}; do ip link set {{ dataplane_interfaces[1].name }} vf ${i} mac 00:11:22:33:00:${i}1; done; ip link show {{ dataplane_interfaces[1].name }}" # noqa yaml[line-length] + ansible.builtin.shell: >- + for i in {0..3}; do ip link set {{ dataplane_interfaces[1].name }} vf ${i} mac 00:11:22:33:00:${i}1; done; ip link show {{ dataplane_interfaces[1].name }} args: executable: /bin/bash register: vfs_macs_dp1 @@ -31,45 +33,73 @@ failed_when: vfs_macs_dp1.rc != 0 - name: show DP VFs config - debug: msg="{{ vfs_macs_dp0.stdout }}\n{{ vfs_macs_dp1.stdout }}" + ansible.builtin.debug: + msg: "{{ vfs_macs_dp0.stdout }}\n{{ vfs_macs_dp1.stdout }}" -- name: load vfio-pci - modprobe: - name: vfio-pci - state: present +- name: copy NIC VFs MAC setup script to /usr/local/bin + copy: + src: "{{ role_path }}/files/cek_flexran_nic_mac.sh" + dest: /usr/local/bin/cek_flexran_nic_mac.sh + owner: root + group: root + mode: '0700' + become: yes + +- name: create systemd unit file + template: + src: cek_flexran_nic_mac.service.j2 + dest: /lib/systemd/system/cek_flexran_nic_mac.service + owner: root + group: root + mode: '0644' + become: yes + +- name: ensure that systemd service is enabled on startup and restarted to apply the configuration + systemd: + name: cek_flexran_nic_mac + state: restarted + enabled: yes + daemon_reload: yes + become: yes - name: show DPDK devices binding for FlexRAN in Docker POD - ansible.builtin.command: "{{ (intel_flexran_dpdk_dir, 'usertools', 'dpdk-devbind.py -s') | path_join }}" + ansible.builtin.command: >- + dpdk-devbind.py -s register: devbind_status changed_when: false -- debug: msg="{{ devbind_status.stdout }}" +- ansible.builtin.debug: + msg: "{{ devbind_status.stdout }}" - name: restart SRIOV NET DP daemonset to re-initialize resources for FlexRAN in Docker POD - ansible.builtin.command: kubectl rollout restart -n kube-system daemonset sriov-net-dp-kube-sriov-device-plugin-amd64 + ansible.builtin.command: >- + kubectl rollout restart -n kube-system daemonset sriov-net-dp-kube-sriov-device-plugin-amd64 register: sriov_ds_restart changed_when: sriov_ds_restart.rc == 0 failed_when: sriov_ds_restart.rc != 0 or ("restarted" not in sriov_ds_restart.stdout) -- debug: msg="{{ sriov_ds_restart.stdout }}" +- ansible.builtin.debug: + msg: "{{ sriov_ds_restart.stdout }}" - name: cluster check-point ansible.builtin.include_role: name: wait_for_kubernetes_ready - name: short wait for SRIOV DP to instate FEC VF resource - pause: + ansible.builtin.pause: seconds: 10 - name: dump resources for FlexRAN in Docker POD - shell: "set -o pipefail && kubectl get node {{ groups['kube_node'][0] }} -o json | jq '.status.allocatable'" # noqa command-instead-of-shell + 
ansible.builtin.shell: >- + set -o pipefail && kubectl get node {{ groups['kube_node'][0] }} -o json | jq '.status.allocatable' args: executable: /bin/bash register: node_resources changed_when: false failed_when: node_resources.rc != 0 -- debug: msg="{{ node_resources.stdout }}" +- ansible.builtin.debug: + msg: "{{ node_resources.stdout }}" - name: check 'intel_fec_5g resources' for FlexRAN in Docker POD ansible.builtin.assert: diff --git a/roles/intel_flexran/tasks/bind_fec.yml b/roles/intel_flexran/tasks/bind_fec.yml index 8d4decd2..2a2b4bc0 100644 --- a/roles/intel_flexran/tasks/bind_fec.yml +++ b/roles/intel_flexran/tasks/bind_fec.yml @@ -14,152 +14,97 @@ ## limitations under the License. ## --- -- name: check igb_uio module is loaded - shell: "set -o pipefail && lsmod | grep -i igb_uio" - args: - executable: /bin/bash - register: igb_uio_module - changed_when: false - failed_when: igb_uio_module.rc != 0 - -- name: insert igb_uio module - when: "'igb_uio' not in igb_uio_module.stdout" - block: - - name: clone DPDK-KMODS repository - git: - repo: "http://dpdk.org/git/dpdk-kmods" - version: "e68a705cc5dc3d1333bbcd722fe4e9a6ba3ee648" # latest as of July 2022 - dest: "{{ (intel_flexran_dpdk_dir, 'dpdk-kmods') | path_join }}" - force: yes - -# - name: build igb_uio -# make: -# chdir: "{{ (intel_flexran_dpdk_dir, 'dpdk-kmods/linux/igb_uio') | path_join }}" +- ansible.builtin.debug: + msg: "fec_acc pciid is {{ fec_acc | regex_replace('^ip-', '') }}" -# - name: build igb_uio with command -# command: make -# args: -# chdir: "{{ (intel_flexran_dpdk_dir, 'dpdk-kmods/linux/igb_uio') | path_join }}" - - # Using shell since the make built-in module and command (above) both get errors - - name: build igb_uio with shell - shell: "make" # noqa command-instead-of-shell - args: - executable: /bin/bash - chdir: "{{ (intel_flexran_dpdk_dir, 'dpdk-kmods/linux/igb_uio') | path_join }}" - changed_when: true +# Configure PF for HOST +- name: set FEC PF Driver + set_fact: + fec_pf_driver: "{{ 'vfio-pci' if (fec_dev == 'acc200' and intel_flexran_type == 'pod') else 'igb_uio' }}" - - name: load uio - modprobe: - name: uio - state: present - - - name: load igb_uio - command: insmod igb_uio.ko - args: - chdir: "{{ (intel_flexran_dpdk_dir, 'dpdk-kmods/linux/igb_uio') | path_join }}" - changed_when: false - failed_when: false - -- debug: msg="fec_acc pciid is {{ fec_acc | regex_replace('^ip-', '') }}" - -- name: bind FEC ACC physical device to igb_uio driver - ansible.builtin.command: "{{ (intel_flexran_dpdk_dir, 'usertools', 'dpdk-devbind.py -b igb_uio ' + fec_acc) | path_join }}" +- name: bind FEC ACC physical device + ansible.builtin.command: >- + dpdk-devbind.py -b {{ fec_pf_driver }} {{ fec_acc }} register: fec_bind_set changed_when: fec_bind_set.rc == 0 - when: fec_dev != "acc200" or intel_flexran_type != "pod" -- name: bind FEC ACC physical device to vfio-pci driver - ansible.builtin.command: "{{ (intel_flexran_dpdk_dir, 'usertools', 'dpdk-devbind.py -b vfio-pci ' + fec_acc) | path_join }}" - register: fec_bind_set - changed_when: fec_bind_set.rc == 0 - when: fec_dev == "acc200" and intel_flexran_type == "pod" - -- name: show DPDK devices binding status - ansible.builtin.command: "{{ (intel_flexran_dpdk_dir, 'usertools', 'dpdk-devbind.py -s') | path_join }}" +- name: show FEC devices binding status + ansible.builtin.command: >- + dpdk-devbind.py --status-dev baseband register: fec_bind_show changed_when: false -- debug: msg="{{ fec_bind_show.stdout }}" +- ansible.builtin.debug: + msg: "{{ fec_bind_show.stdout 
}}" +# Create and configure VF for POD - name: set FEC ACC virtual device for FlexRAN in Docker POD block: - - name: load vfio-pci - modprobe: - name: vfio-pci - state: present - - - name: reset FEC VFs for acc100 -# ansible.builtin.copy: -# dest: "{{ ('/sys/bus/pci/devices/', fec_acc, 'max_vfs') | path_join }}" -# content: "0" -# force: yes -# MSG: -# The destination directory (/sys/bus/pci/devices/0000:51:00.0) is not writable by the current user. -# Error was: [Errno 13] Permission denied: #b'/sys/bus/pci/devices/0000:51:00.0/.ansible_tmpx26av47dmax_vfs' - shell: "echo 0 > {{ ('/sys/bus/pci/devices/', fec_acc, 'max_vfs') | path_join }}" # noqa command-instead-of-shell - args: - executable: /bin/bash - register: set_fec_max_vfs - changed_when: set_fec_max_vfs.rc == 0 - failed_when: set_fec_max_vfs.rc != 0 - when: fec_dev == "acc100" - - - name: reset FEC VFs for acc200 - ansible.builtin.shell: "echo 0 > {{ ('/sys/bus/pci/devices/', fec_acc, 'sriov_numvfs') | path_join }}" # noqa command-instead-of-shell - args: - executable: /bin/bash - register: set_fec_max_vfs - changed_when: set_fec_max_vfs.rc == 0 - failed_when: set_fec_max_vfs.rc != 0 - when: fec_dev == "acc200" - - - name: instate one FEC VF for acc100 -# ansible.builtin.copy: -# dest: "{{ ('/sys/bus/pci/devices/', fec_acc, 'max_vfs') | path_join }}" -# content: "1" -# force: yes - shell: "echo 1 > {{ ('/sys/bus/pci/devices/', fec_acc, 'max_vfs') | path_join }}" - args: - executable: /bin/bash - register: set_fec_max_vfs - changed_when: set_fec_max_vfs.rc == 0 - failed_when: set_fec_max_vfs.rc != 0 - when: fec_dev == "acc100" + - name: set vfs_directory + set_fact: + vfs_directory: "{{ 'sriov_numvfs' if fec_dev == 'acc200' else 'max_vfs' }}" - - name: instate one FEC VF for acc200 - ansible.builtin.shell: "echo 1 > {{ ('/sys/bus/pci/devices/', fec_acc, 'sriov_numvfs') | path_join }}" + - name: reset FEC VFs and instate one FEC VF + ansible.builtin.shell: >- + echo {{ item }} > {{ ('/sys/bus/pci/devices/', fec_acc, vfs_directory) | path_join }} args: executable: /bin/bash - register: set_fec_max_vfs - changed_when: set_fec_max_vfs.rc == 0 - failed_when: set_fec_max_vfs.rc != 0 - when: fec_dev == "acc200" + register: set_fec_vfs + changed_when: set_fec_vfs.rc == 0 + failed_when: set_fec_vfs.rc != 0 + with_items: + - 0 + - 1 - name: probe for FEC VF - shell: "set -o pipefail && lspci | grep -i acc | grep -i -E \"0d5d|57c1\"" # noqa command-instead-of-shell + ansible.builtin.shell: >- + set -o pipefail && lspci | grep -i acc | grep -i -e 0d5d -e 57c1 args: executable: /bin/bash register: acc_vf_probe changed_when: false failed_when: acc_vf_probe.rc != 0 - - debug: msg="{{ acc_vf_probe.stdout }}" + - ansible.builtin.debug: + msg: "{{ acc_vf_probe.stdout }}" - - set_fact: + - ansible.builtin.set_fact: fec_acc_vf: "{{ acc_vf_probe.stdout.split(' ').0 }}" - - debug: msg="fec_acc_vf is at {{ fec_acc_vf }}" + - ansible.builtin.debug: + msg: "fec_acc_vf is at {{ fec_acc_vf }}" - name: bind FEC ACC VF device - shell: "./dpdk-devbind.py -b vfio-pci {{ fec_acc_vf }} && ./dpdk-devbind.py -s" # noqa command-instead-of-shell + ansible.builtin.shell: >- + dpdk-devbind.py -b vfio-pci {{ fec_acc_vf }} && dpdk-devbind.py --status-dev baseband args: executable: /bin/bash - chdir: "{{ (intel_flexran_dpdk_dir, 'usertools') | path_join }}" register: fec_bind changed_when: fec_bind.rc == 0 failed_when: fec_bind.rc != 0 - - debug: msg="{{ fec_bind.stdout }}" + - ansible.builtin.debug: + msg: "{{ fec_bind.stdout }}" when: intel_flexran_type == "pod" + +- 
name: clean up existing FEC info + ansible.builtin.file: + path: "{{ fec_info_path }}/cek_fec_info" + state: absent + become: yes + when: + - not (intel_sriov_fec_operator_enabled | default(false) | bool) + +- name: save FEC info + ansible.builtin.lineinfile: + path: "{{ fec_info_path }}/cek_fec_info" + line: "{{ fec_acc }} {{ fec_dev }} {{ fec_pf_driver }} {{ '1' if intel_flexran_type == 'pod' else '0' }}" + regexp: "^{{ fec_acc }}" + create: yes + owner: root + group: root + mode: '0600' + become: yes + when: + - not (intel_sriov_fec_operator_enabled | default(false) | bool) diff --git a/roles/intel_flexran/tasks/fec_acc.yml b/roles/intel_flexran/tasks/fec_acc.yml index a072f1dd..25531ce3 100644 --- a/roles/intel_flexran/tasks/fec_acc.yml +++ b/roles/intel_flexran/tasks/fec_acc.yml @@ -14,74 +14,58 @@ ## limitations under the License. ## --- -- name: configure ACC100 FEC device with pf_bb tool for FlexRAN in Host - block: - - name: configure ACC100 FEC device with pf_bb tool for FlexRAN in Host - command: "./pf_bb_config acc100 -c ./acc100/acc100_config_pf_4g5g.cfg" # select .cfg file as desired - args: - chdir: "{{ (intel_flexran_dir, 'source/pf-bb-config') | path_join }}" - register: pf_bb_config - changed_when: false +- name: Set common pf_bb_cfg command + ansible.builtin.set_fact: + pf_bb_cfg_cmd: "./pf_bb_config {{ fec_dev }} -c {{ pf_bb_cfg[fec_dev][intel_flexran_type] }}" - - debug: msg="{{ pf_bb_config.stdout }}" +- name: Set Flexran pod mode w/ ACC200 pf_bb_cfg command + ansible.builtin.set_fact: + pf_bb_cfg_cmd: "./pf_bb_config {{ fec_dev }} -v 00112233-4455-6677-8899-aabbccddeeff -c {{ pf_bb_cfg[fec_dev][intel_flexran_type] }}" when: - - fec_dev == "acc100" - - intel_flexran_type == "host" + - fec_dev == "acc200" + - intel_flexran_type == "pod" -- name: configure ACC100 FEC device with pf_bb tool for FlexRAN in Docker POD - block: - - name: configure ACC100 FEC device with pf_bb tool for FlexRAN in Docker POD - command: "./pf_bb_config acc100 -c ./acc100/acc100_config_vf_5g.cfg" # select .cfg file as desired - args: - chdir: "{{ (intel_flexran_dir, 'source/pf-bb-config') | path_join }}" - register: pf_bb_config - changed_when: false +- ansible.builtin.debug: + msg: "pf_bb_cfg_cmd: {{ pf_bb_cfg_cmd }}" - - debug: msg="{{ pf_bb_config.stdout }}" - when: - - fec_dev == "acc100" - - intel_flexran_type == "pod" +- name: configure FEC device with pf_bb tool + ansible.builtin.command: + cmd: "{{ pf_bb_cfg_cmd }}" + chdir: "{{ pf_bb_download_dir }}" + register: pf_bb_config + changed_when: false -- name: configure ACC200 FEC device with pf_bb tool for FlexRAN in Host - block: - - name: configure ACC200 FEC device with pf_bb tool for FlexRAN in Host - command: "./pf_bb_config acc200 -c ./acc200/acc200_config_pf_5g.cfg" # select .cfg file as desired - args: - chdir: "{{ (intel_flexran_dir, 'source/pf-bb-config') | path_join }}" - register: pf_bb_config - changed_when: false +- ansible.builtin.debug: + msg: "{{ pf_bb_config.stdout }}" - - debug: msg="{{ pf_bb_config.stdout }}" +- name: copy FEC setup script to /usr/local/bin + copy: + src: "{{ role_path }}/files/cek_sriov_fec_init.sh" + dest: /usr/local/bin/cek_sriov_fec_init.sh + owner: root + group: root + mode: '0700' + become: yes when: - - fec_dev == "acc200" - - intel_flexran_type == "host" + - not (intel_sriov_fec_operator_enabled | default(false) | bool) -- name: configure ACC200 FEC device with pf_bb tool for FlexRAN in Docker POD - block: - - name: set FEC device uuid - ansible.builtin.set_fact: - uuid: 
"00112233-4455-6677-8899-aabbccddeeff" - - name: configure ACC200 FEC device with pf_bb tool for FlexRAN in Docker POD - ansible.builtin.command: - cmd: "./pf_bb_config acc200 -v {{ uuid }} -c acc200/acc200_config_vf_5g.cfg" # select .cfg file as desired - chdir: "{{ (intel_flexran_dir, 'source/pf-bb-config') | path_join }}" - register: pf_bb_config - changed_when: false - - debug: msg="{{ pf_bb_config.stdout }}" - - name: get the first FEC VF - ansible.builtin.shell: "set -o pipefail && lspci | grep -i acc | grep -i 57c1 | awk '{print $1}' | head -1" - args: - executable: /bin/bash - register: fec_first_vf - changed_when: false - - ansible.builtin.debug: msg="fec_first_vf is at {{ fec_first_vf.stdout }}" - - name: run basic bbdev test - ansible.builtin.command: >- - ./test-bbdev.py -e="-c 0xff0 -a 0000:{{ fec_first_vf.stdout }} - --vfio-vf-token={{ uuid }}" -t 6 -n 100 -b 80 -l 1 -c validation -v ./ldpc_dec_default.data - args: - chdir: "{{ dpdk_dir }}/app/test-bbdev" - changed_when: false +- name: create systemd unit file + template: + src: cek_sriov_fec_init.service.j2 + dest: /lib/systemd/system/cek_sriov_fec_init.service + owner: root + group: root + mode: '0644' + become: yes when: - - fec_dev == "acc200" - - intel_flexran_type == "pod" + - not (intel_sriov_fec_operator_enabled | default(false) | bool) + +- name: ensure that systemd service is enabled on startup and restarted to apply the configuration + systemd: + name: cek_sriov_fec_init + state: restarted + enabled: yes + daemon_reload: yes + become: yes + when: + - not (intel_sriov_fec_operator_enabled | default(false) | bool) diff --git a/roles/intel_flexran/tasks/flexran.yml b/roles/intel_flexran/tasks/flexran.yml index 0d5d30c4..bae25e86 100644 --- a/roles/intel_flexran/tasks/flexran.yml +++ b/roles/intel_flexran/tasks/flexran.yml @@ -14,57 +14,17 @@ ## limitations under the License. ## --- -# no need. 
per POR, customer must manually pre-extract FlexRAN package on target(s) -# - name: create Intel FlexRAN directory on worker node -# file: -# path: "{{ intel_flexran_dir }}" -# state: directory -# mode: '0755' - -# - name: unpack Intel FlexRAN tarball on target(s) -# unarchive: -# src: "{{ (intel_flexran_staging_location, intel_flexran_tarball) | path_join }}" -# dest: "{{ intel_flexran_dir }}" -# mode: '0755' - -# - name: extract Intel FlexRAN -# shell: "echo | ./extract.sh" # noqa no-changed-when -# args: -# chdir: "{{ intel_flexran_dir }}" - # As the path to the libnuma library is different with Ubuntu, a soft link is created to avoid multiple changes to the makefiles # (FlexRAN hardcoded the path for libnuma) - name: create libnuma symlink - file: + ansible.builtin.file: src: "/usr/lib/x86_64-linux-gnu/libnuma.so" dest: "/usr/lib64/libnuma.so" state: link when: ansible_distribution in ['Ubuntu'] -- name: patch Intel FlexRAN for xx.yy.z release - block: - - name: copy Intel FlexRAN patch file - ansible.builtin.copy: - src: "{{ (intel_flexran_staging_location, intel_flexran_patch) | path_join }}" - dest: "{{ (intel_flexran_dir, intel_flexran_patch) | path_join }}" - mode: '0644' - - - name: unzip Intel FlexRAN patch - ansible.builtin.unarchive: - src: "{{ (intel_flexran_dir, intel_flexran_patch) | path_join }}" - dest: "{{ intel_flexran_dir }}" - remote_src: yes - mode: '0755' - - - name: apply Intel FlexRAN patch - ansible.builtin.command: "./FlexRAN-R{{ intel_flexran_ver }}.sh" - args: - chdir: "{{ intel_flexran_dir }}" - changed_when: true - when: intel_flexran_ver | length > 5 - - name: set DPDK path for Intel FlexRAN - copy: + ansible.builtin.copy: dest: "{{ (intel_flexran_dir, '.flexran_dpdk.path') | path_join }}" content: "{{ intel_flexran_dpdk_dir }}" mode: '0755' @@ -74,35 +34,36 @@ file: ../intel_oneapi_install/vars/main.yml - name: set oneAPI path for Intel FlexRAN - copy: + ansible.builtin.copy: dest: "{{ (intel_flexran_dir, '.flexran_icx.path') | path_join }}" content: "{{ intel_oneapi_install_dir }}" mode: '0755' -- debug: msg="Intel FlexRAN mode is '{{ intel_flexran_mode }}'" +- ansible.builtin.debug: + msg: "Intel FlexRAN mode is '{{ intel_flexran_mode }}'" - name: set Intel FlexRAN mode - lineinfile: + ansible.builtin.lineinfile: path: "{{ (intel_flexran_dir, 'xran/build.sh') | path_join }}" regexp: '^SAMPLEAPP=0' line: SAMPLEAPP=1 when: intel_flexran_mode == "xran" -# Only for 5GISA. 
Also needs 1 in _timer.xml -# - name: set Intel FlexRAN target isa -# set_fact: -# target_isa: "-i spr" -# when: configured_arch == "spr" - - name: build Intel FlexRAN SDK - shell: "source set_env_var.sh -d && ./flexran_build.sh -e -r 5gnr {{ target_isa | default('') }} -m sdk" + ansible.builtin.shell: >- + source set_env_var.sh -d && + ./flexran_build.sh -e -r 5gnr -m sdk args: executable: /bin/bash chdir: "{{ intel_flexran_dir }}" changed_when: true - name: build FlexRAN ALL for 5GNR - shell: "ldconfig && export RTE_SDK={{ intel_flexran_dpdk_dir }} && source set_env_var.sh -d && export PKG_CONFIG_PATH=$RTE_SDK/build/meson-uninstalled && ./flexran_build.sh -e -r 5gnr {{ target_isa | default('') }}" # noqa yaml[line-length] + ansible.builtin.shell: >- + ldconfig && export RTE_SDK={{ intel_flexran_dpdk_dir }} && + source set_env_var.sh -d && + export PKG_CONFIG_PATH=$RTE_SDK/build/meson-uninstalled && + ./flexran_build.sh -e -r 5gnr args: executable: /bin/bash chdir: "{{ intel_flexran_dir }}" diff --git a/roles/intel_flexran/tasks/flexran_preflight.yml b/roles/intel_flexran/tasks/flexran_preflight.yml index be939424..d0af66c4 100644 --- a/roles/intel_flexran/tasks/flexran_preflight.yml +++ b/roles/intel_flexran/tasks/flexran_preflight.yml @@ -62,8 +62,14 @@ - debug: msg="BBU_FH={{ intel_flexran_bbu_front_haul }} BBU_PS={{ intel_flexran_bbu_ptp_sync }} oRU_FH={{ intel_flexran_oru_front_haul }} oRU_PS={{ intel_flexran_oru_ptp_sync }}" # noqa yaml[line-length] - name: check network for FlexRAN assert: - that: "intel_flexran_bbu_front_haul is defined and intel_flexran_bbu_ptp_sync is defined and intel_flexran_oru_front_haul is defined and intel_flexran_oru_ptp_sync is defined" # noqa yaml[line-length] - msg: "Intel FlexRAN on bare-metal host in xRAN test mode requires defining the network devices for 'Front Haul' and 'PTP Sync'. See docs/flexran_guide.md" + that: + - intel_flexran_bbu_front_haul is defined + - intel_flexran_bbu_ptp_sync is defined + - intel_flexran_oru_front_haul is defined + - intel_flexran_oru_ptp_sync is defined + msg: >- + Intel FlexRAN on bare-metal host in xRAN test mode requires defining the network devices for 'Front Haul' and 'PTP Sync'. + See docs/flexran_guide.md. when: intel_flexran_mode == 'xran' # check NIC for FlexRAN @@ -81,7 +87,7 @@ - name: check DP Interfaces ansible.builtin.assert: - that: "{{ dataplane_interfaces | length }} >= 2" + that: dataplane_interfaces | length >= 2 fail_msg: "For FlexRAN xRAN test mode, at least TWO dataplane (DP) interface(s) on target '{{ ansible_hostname }}' must be set in host_vars. Please correct the configuration" # noqa yaml[line-length] success_msg: "Assertion passed. Two (or more) dataplane (DP) interface(s) are configured." when: intel_flexran_mode == 'xran' @@ -99,38 +105,6 @@ success_msg: "Assertion passed. 
Intel FlexRAN on bare-metal host is supported and can be deployed on target with {{ cpu_id }} CPU" failed_when: false - # check patch when FlexRAN xx.yy.z - - name: check FlexRAN PATCH Release - when: intel_flexran_ver | length > 5 - block: - - debug: msg="Expecting file {{ (intel_flexran_staging_location, intel_flexran_patch) | path_join }} on local ansible host" - - - name: probe for FlexRAN xx.yy.z patch - delegate_to: localhost - become: false - stat: - path: "{{ (intel_flexran_staging_location, intel_flexran_patch) | path_join }}" - checksum_algorithm: sha256 - register: provided_flexran_patch - - - debug: msg="{{ intel_flexran_patch }} exists is {{ provided_flexran_patch.stat.exists }}" - - - name: check the FlexRAN xx.yy.z patch name - assert: - that: "provided_flexran_patch.stat.exists" - msg: - - Mandatory file {{ (intel_flexran_staging_location, intel_flexran_patch) | path_join }} does NOT exist on localhost. - - Please acquire the zip file and place it in the location indicated above in order to deploy FlexRAN. See docs/flexran_guide.md - - - debug: msg="{{ intel_flexran_patch }} checksum is {{ provided_flexran_patch.stat.checksum }}" - - - name: check the FlexRAN xx.yy.z patch integrity - assert: - that: "provided_flexran_patch.stat.checksum == '{{ intel_flexran_patch_chk }}'" - msg: - - File {{ (intel_flexran_staging_location, intel_flexran_patch) | path_join }} on localhost is NOT the expected one. - - Please provide the correct file. See docs/flexran_guide.md - # check DPDK patch for FlexRAN - debug: msg="Expecting file {{ (dpdk_local_patches_dir, 'dpdk-' + dpdk_version, intel_flexran_dpdk_patch) | path_join }} on local ansible host" @@ -146,7 +120,7 @@ - name: check the FlexRAN DPDK patch name assert: - that: "provided_flexran_dpdk_patch.stat.exists" + that: provided_flexran_dpdk_patch.stat.exists msg: - Mandatory file {{ (dpdk_local_patches_dir, 'dpdk-' + dpdk_version, intel_flexran_dpdk_patch) | path_join }} does NOT exist on localhost. - Please acquire the DPDK patch.zip and unzip it in the location indicated above in order to deploy FlexRAN. See docs/flexran_guide.md @@ -155,7 +129,7 @@ - name: check the FlexRAN DPDK patch integrity assert: - that: "provided_flexran_dpdk_patch.stat.checksum == '{{ intel_flexran_dpdk_patch_chk }}'" + that: provided_flexran_dpdk_patch.stat.checksum == intel_flexran_dpdk_patch_chk msg: - File {{ (dpdk_local_patches_dir, 'dpdk-' + dpdk_version, intel_flexran_dpdk_patch) | path_join }} on localhost is NOT the expected one. - Please provide the correct file. See docs/flexran_guide.md @@ -194,8 +168,8 @@ - name: check linux distro version and kernel for FlexRAN assert: that: > - (ansible_distribution == 'Ubuntu' and ansible_distribution_version == '22.04' and 'realtime' in ansible_kernel) or - (ansible_distribution == 'RedHat' and ansible_distribution_version == '9.2' and 'rt' in ansible_kernel) + (ansible_distribution == 'Ubuntu' and ansible_distribution_version is version('22.04', '==') and 'realtime' in ansible_kernel) or + (ansible_distribution == 'RedHat' and ansible_distribution_version is version('9.2', '==') and 'rt' in ansible_kernel) msg: - Deploying Intel FlexRAN is supported only on Ubuntu 22.04 or RHEL 9.2 and with real-time kernel. - Please prepare accordingly the o/s image on target(s) or disable FlexRAN. 
See docs/flexran_guide.md @@ -217,7 +191,9 @@ - name: check FlexRAN folders assert: - that: "item.stat.exists and item.stat.isdir" + that: + - item.stat.exists + - item.stat.isdir msg: - Directory '{{ item.item }}' is missing on target '{{ inventory_hostname }}' - Deploying Intel FlexRAN requires the tarball package to be pre-extracted on the worker node. See docs/flexran_guide.md @@ -246,8 +222,10 @@ - debug: msg="Linux distribution on target is {{ ansible_distribution }} {{ ansible_distribution_version }} ({{ ansible_distribution_release }}) with {{ ansible_kernel }} kernel" # noqa yaml[line-length] - name: check linux distro version and kernel for FlexRAN in Docker POD ansible.builtin.assert: - that: > - (ansible_distribution == 'Ubuntu' and ansible_distribution_version == '22.04' and 'realtime' in ansible_kernel) + that: + - ansible_distribution == "Ubuntu" + - ansible_distribution_version is version('22.04', '==') + - "'realtime' in ansible_kernel" fail_msg: - Deploying Intel FlexRAN in Docker POD is supported only on Ubuntu 22.04 with real-time kernel. - Please prepare accordingly the o/s image on target(s) or disable FlexRAN. See docs/flexran_guide.md @@ -260,8 +238,11 @@ - debug: msg="CPU={{ ansible_processor[2] }} cores={{ ansible_processor_cores }} count={{ ansible_processor_count }} nproc={{ ansible_processor_nproc }} tpc={{ ansible_processor_threads_per_core }} vcpus={{ ansible_processor_vcpus }}" # noqa yaml[line-length] - name: check CPU for FlexRAN in Docker POD on ICX-SP ansible.builtin.assert: - that: "ansible_processor_count == 1 and ansible_processor_cores == 32 and cpu_id == '6338N'" - fail_msg: "Intel FlexRAN in Docker POD on ICL-SP requires worker with single 32-cores ICX 6338N CPU" + that: + - ansible_processor_count == 1 + - ansible_processor_cores == 32 + - cpu_id == '6338N' + fail_msg: "Intel FlexRAN in Docker POD on ICX-SP requires worker with single 32-cores ICX 6338N CPU" success_msg: "Assertion passed. Intel FlexRAN in Docker POD is supported and can be deployed on target with ICX {{ cpu_id }} CPU" when: - configured_arch == "icx" @@ -276,12 +257,14 @@ - configured_arch == "spr" # check runtime for FlexRAN in POD on SPR - # needs containerd runtime to support non-root user to use FEC in pod + # needs containerd runtime to support common user to use FEC in pod - debug: msg="Container runtime is set to {{ container_runtime }}" - name: check runtime for FlexRAN in POD on SPR ansible.builtin.assert: that: container_runtime == "containerd" - fail_msg: "Deploying Intel FlexRAN as a POD on SPR needs containerd runtime to support non-root user to use FEC in pod. Please correct the group_vars configuration" + fail_msg: >- + Deploying Intel FlexRAN as a POD on SPR needs containerd runtime. + Please correct the group_vars configuration. success_msg: "Assertion passed. Intel FlexRAN as a POD on SPR is supported and can be deployed on '{{ container_runtime }}' runtime" when: - configured_arch == "spr" @@ -309,7 +292,7 @@ when: intel_flexran_mode == 'xran' - name: check DP Interfaces ansible.builtin.assert: - that: "{{ dataplane_interfaces | length }} >= 2" + that: dataplane_interfaces | length >= 2 fail_msg: "At least TWO dataplane (DP) interface(s) on target '{{ ansible_hostname }}' must be set in host_vars. Please correct the configuration" success_msg: "Assertion passed. Two (or more) dataplane (DP) interface(s) are configured." 
when: intel_flexran_mode == 'xran' diff --git a/roles/intel_flexran/tasks/main.yml b/roles/intel_flexran/tasks/main.yml index a07103f3..fbb4fd58 100644 --- a/roles/intel_flexran/tasks/main.yml +++ b/roles/intel_flexran/tasks/main.yml @@ -14,7 +14,14 @@ ## limitations under the License. ## --- -- debug: msg="Entering FlexRAN Role" +- ansible.builtin.debug: + msg: "Entering FlexRAN Role" + +- name: create Intel FlexRAN files directory + ansible.builtin.file: + path: "{{ intel_flexran_files_dir }}" + state: directory + mode: '0755' - name: remove oRU from inventory (if defined) to skip its tasks in 'timer' test mode block: @@ -82,7 +89,8 @@ - inventory_hostname == groups['kube_node'][0] - not intel_sriov_fec_operator_enabled -- debug: msg="FEC Accelerator device at {{ fec_acc }} is {{ fec_dev }}" +- ansible.builtin.debug: + msg: "FEC Accelerator device at {{ fec_acc }} is {{ fec_dev }}" when: inventory_hostname == groups['kube_node'][0] - name: bind dpdk drivers to FEC ACC device(s) diff --git a/roles/intel_flexran/tasks/pf_bb.yml b/roles/intel_flexran/tasks/pf_bb.yml index 76839e40..771f7c0b 100644 --- a/roles/intel_flexran/tasks/pf_bb.yml +++ b/roles/intel_flexran/tasks/pf_bb.yml @@ -14,32 +14,13 @@ ## limitations under the License. ## --- -- name: clone inih repository - git: - repo: "https://github.com/benhoyt/inih" - version: "{{ inih_version }}" - dest: "{{ (intel_flexran_dir, 'source/inih') | path_join }}" - force: yes - -- name: build inih - make: - file: "Makefile.static" - chdir: "{{ (intel_flexran_dir, 'source/inih/extra') | path_join }}" - -- name: copy libinih.a - copy: - remote_src: yes - src: "{{ (intel_flexran_dir, 'source/inih/extra', 'libinih.a') | path_join }}" - dest: "{{ (intel_flexran_dir, 'source/inih/') | path_join }}" - mode: '0755' - - name: clone pf-bb-config repository - git: + ansible.builtin.git: repo: "https://github.com/intel/pf-bb-config.git" version: "{{ intel_pfbb_version }}" - dest: "{{ (intel_flexran_dir, 'source/pf-bb-config') | path_join }}" + dest: "{{ pf_bb_download_dir }}" force: yes - name: build pf-bb-config tool - make: - chdir: "{{ (intel_flexran_dir, 'source/pf-bb-config') | path_join }}" + community.general.make: + chdir: "{{ pf_bb_download_dir }}" diff --git a/roles/intel_flexran/tasks/pod.yml b/roles/intel_flexran/tasks/pod.yml index 1b9bf9ed..6528cfa6 100644 --- a/roles/intel_flexran/tasks/pod.yml +++ b/roles/intel_flexran/tasks/pod.yml @@ -16,10 +16,10 @@ --- - name: Enable non root user to start Flexran pod on SPR-EE block: - - name: enable device_ownership_from_security_context in /etc/containerd/config.toml + - name: enable device_ownership_from_security_context in containerd daemon configuration ansible.builtin.lineinfile: - path: /etc/containerd/config.toml - insertafter: 'enable_unprivileged_icmp = false' + path: "{{ containerd_conf_file }}" + insertafter: 'enable_unprivileged_icmp' line: " device_ownership_from_security_context = true" - name: containerd | restart containerd ansible.builtin.systemd: @@ -27,11 +27,17 @@ state: restarted enabled: yes daemon-reload: yes + when: kube_provisioner == 'kubespray' + - name: containerd | restart rke2 server + ansible.builtin.systemd: + name: rke2-server.service + state: restarted + when: kube_provisioner == 'rke2' when: - configured_arch == "spr" - container_runtime == "containerd" -- name: generate FlexRAN Docker POD files +- name: generate FlexRAN POD files template: src: "{{ item.src }}" dest: "{{ (intel_flexran_files_dir, item.dst) | path_join }}" @@ -40,57 +46,32 @@ loop: - {src: 
'intel_flexran_pod_timer_mode_icx_sp.yaml.j2', dst: 'intel_flexran_pod_timer_mode_icx_sp.yaml'} - {src: 'intel_flexran_pod_xran_mode_icx_sp.yaml.j2', dst: 'intel_flexran_pod_xran_mode_icx_sp.yaml'} - - {src: 'intel_flexran_pod_timer_mode_spr_ee_non_root.yaml.j2', dst: 'intel_flexran_pod_timer_mode_spr_ee_non_root.yaml'} - - {src: 'intel_flexran_pod_xran_mode_spr_ee_non_root.yaml.j2', dst: 'intel_flexran_pod_xran_mode_spr_ee_non_root.yaml'} + - {src: 'intel_flexran_pod_timer_mode_spr_ee.yaml.j2', dst: 'intel_flexran_pod_timer_mode_spr_ee.yaml'} + - {src: 'intel_flexran_pod_xran_mode_spr_ee.yaml.j2', dst: 'intel_flexran_pod_xran_mode_spr_ee.yaml'} -- name: create namespace for FlexRAN as Docker POD - k8s: +- name: create namespace for FlexRAN as POD + kubernetes.core.k8s: name: "{{ intel_flexran_namespace }}" kind: Namespace state: present - name: label node for running FlexRAN as Docker POD - shell: "kubectl label node {{ groups['kube_node'][0] }} testnode=worker1" # noqa command-instead-of-shell + ansible.builtin.shell: >- + kubectl label node {{ groups['kube_node'][0] }} testnode=worker1 args: executable: /bin/bash register: sriov_pod_delete changed_when: sriov_pod_delete.rc == 0 failed_when: sriov_pod_delete.rc != 0 -- name: run FlexRAN as Docker POD in Timer test mode on ICX-SP - k8s: - state: present - src: "{{ (intel_flexran_files_dir, 'intel_flexran_pod_timer_mode_icx_sp.yaml') | path_join }}" - when: - - intel_flexran_mode == "timer" - - configured_arch == "icx" - -- name: run FlexRAN as Docker POD in XRAN test mode on ICX-SP - k8s: - state: present - src: "{{ (intel_flexran_files_dir, 'intel_flexran_pod_xran_mode_icx_sp.yaml') | path_join }}" - when: - - intel_flexran_mode == "xran" - - configured_arch == "icx" - -- name: run FlexRAN as POD in Timer test mode on SPR-EE - k8s: +- name: run FlexRAN as Docker POD + kubernetes.core.k8s: state: present - src: "{{ (intel_flexran_files_dir, 'intel_flexran_pod_timer_mode_spr_ee_non_root.yaml') | path_join }}" - when: - - intel_flexran_mode == "timer" - - configured_arch == "spr" - -- name: run FlexRAN as POD in XRAN test mode on SPR-EE - k8s: - state: present - src: "{{ (intel_flexran_files_dir, 'intel_flexran_pod_xran_mode_spr_ee_non_root.yaml') | path_join }}" - when: - - intel_flexran_mode == "xran" - - configured_arch == "spr" + src: "{{ (intel_flexran_files_dir, flexran_pod[configured_arch][intel_flexran_mode]) | path_join }}" - name: cluster final check-point ansible.builtin.include_role: name: wait_for_kubernetes_ready -- debug: msg="Intel FlexRAN deployment as Docker POD in {{ intel_flexran_mode }} test mode is complete." +- ansible.builtin.debug: + msg: "Intel FlexRAN deployment as Docker POD in {{ intel_flexran_mode }} test mode on arch {{ configured_arch }} is complete." diff --git a/roles/intel_flexran/tasks/timer_mode.yml b/roles/intel_flexran/tasks/timer_mode.yml index 321d72a4..914317c9 100644 --- a/roles/intel_flexran/tasks/timer_mode.yml +++ b/roles/intel_flexran/tasks/timer_mode.yml @@ -48,43 +48,3 @@ msg: - "Intel FlexRAN deployment is complete and Timer Mode configuration is done." - "The worker node is ready for L1/L2 Tests to be executed and verified. 
See docs/flexran_guide.md" - -# - name: run L1 -# shell: "source set_env_var.sh -d && cd {{ (intel_flexran_dir, 'bin/nr5g/gnb/l1') | path_join }} && ./l1.sh -e" # noqa command-instead-of-shell -# args: -# executable: /bin/bash -# chdir: "{{ intel_flexran_dir }}" -# async: 150 -# poll: 0 -# register: l1 - -# - name: wait l1 ready -# pause: -# seconds: 30 - -# - name: run L2 -# shell: "source set_env_var.sh -d && cd {{ (intel_flexran_dir, 'bin/nr5g/gnb/testmac') | path_join }} && ./l2.sh --testfile=icelake-sp/icxsp_mu0_10mhz_4x4_hton.cfg" # noqa yaml[line-length] -# args: -# executable: /bin/bash -# chdir: "{{ intel_flexran_dir }}" -# async: 120 -# poll: 0 -# register: l2 - -# - name: wait l2 finish -# pause: -# seconds: 150 - -# - name: dump l1 result -# async_status: jid="{{ l1.ansible_job_id }}" -# register: l1_result - -# - debug: msg="{{ l1_result.stdout }}" -# failed_when: false - -# - name: dump l2 result -# async_status: jid="{{ l2.ansible_job_id }}" -# register: l2_result - -# - debug: msg="{{ l2_result.stdout }}" -# failed_when: false diff --git a/roles/intel_flexran/templates/cek_flexran_nic_mac.service.j2 b/roles/intel_flexran/templates/cek_flexran_nic_mac.service.j2 new file mode 100644 index 00000000..00d42720 --- /dev/null +++ b/roles/intel_flexran/templates/cek_flexran_nic_mac.service.j2 @@ -0,0 +1,14 @@ +[Unit] +Description=Intel Container Experience Kits NIC VF MAC configuration for FlexRAN +AssertPathExists=/usr/local/bin/cek_flexran_nic_mac.sh +After=cek_sriov_nic_init.service +Requires=cek_sriov_nic_init.service + +[Service] +Environment=SRIOV_NUMVFS_MAPPINGS={{ sriov_config_path }}/cek_sriov_numvfs +Type=oneshot +ExecStartPre=/bin/sleep 10 +ExecStart=/usr/local/bin/cek_flexran_nic_mac.sh + +[Install] +WantedBy=multi-user.target diff --git a/roles/intel_flexran/templates/cek_sriov_fec_init.service.j2 b/roles/intel_flexran/templates/cek_sriov_fec_init.service.j2 new file mode 100644 index 00000000..f60ca3a8 --- /dev/null +++ b/roles/intel_flexran/templates/cek_sriov_fec_init.service.j2 @@ -0,0 +1,14 @@ +[Unit] +Description=Intel Container Experience Kits FEC device configuration +AssertPathExists=/usr/local/bin/cek_sriov_fec_init.sh + +[Service] +Environment=DEVBIND_TOOL=/usr/local/bin/dpdk-devbind.py +Environment=CEK_FEC_INFO={{ fec_info_path }}/cek_fec_info +Type=oneshot +RemainAfterExit=true +ExecStartPre=/bin/sleep 10 +ExecStart=/usr/local/bin/cek_sriov_fec_init.sh + +[Install] +WantedBy=multi-user.target diff --git a/roles/intel_flexran/templates/intel_flexran_pod_timer_mode_spr_ee_non_root.yaml.j2 b/roles/intel_flexran/templates/intel_flexran_pod_timer_mode_spr_ee.yaml.j2 similarity index 100% rename from roles/intel_flexran/templates/intel_flexran_pod_timer_mode_spr_ee_non_root.yaml.j2 rename to roles/intel_flexran/templates/intel_flexran_pod_timer_mode_spr_ee.yaml.j2 diff --git a/roles/intel_flexran/templates/intel_flexran_pod_xran_mode_spr_ee_non_root.yaml.j2 b/roles/intel_flexran/templates/intel_flexran_pod_xran_mode_spr_ee.yaml.j2 similarity index 100% rename from roles/intel_flexran/templates/intel_flexran_pod_xran_mode_spr_ee_non_root.yaml.j2 rename to roles/intel_flexran/templates/intel_flexran_pod_xran_mode_spr_ee.yaml.j2 diff --git a/roles/intel_flexran/vars/main.yml b/roles/intel_flexran/vars/main.yml index adb643b7..c4cd6560 100644 --- a/roles/intel_flexran/vars/main.yml +++ b/roles/intel_flexran/vars/main.yml @@ -58,3 +58,6 @@ install_dependencies: - libzstd-devel.x86_64 - iproute-devel.x86_64 # - pyelftools # RH8.6RT: "failures": "No package pyelftools 
available." + +fec_info_path: /etc/cek +sriov_config_path: /etc/cek diff --git a/roles/intel_inband_manageability/defaults/main.yml b/roles/intel_inband_manageability/defaults/main.yml new file mode 100644 index 00000000..88502d80 --- /dev/null +++ b/roles/intel_inband_manageability/defaults/main.yml @@ -0,0 +1,27 @@ +## +## Copyright (c) 2020-2023 Intel Corporation. +## +## Licensed under the Apache License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. +## You may obtain a copy of the License at +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +## See the License for the specific language governing permissions and +## limitations under the License. +## +--- +intel_inbm_path: "{{ ('/opt/intel', 'inbm') | path_join }}" +intel_inbm_git_repo: >- + {{ 'https://gitee.com/mirrors_intel/intel-inb-manageability' + if prc_network + else + 'https://github.com/intel/intel-inb-manageability' }} +intel_inbm_git_tag: "v4.1.4" + +mqtt_ssl_ciphers: + "AES128-CCM-SHA256:CHACHA20-POLY1305-SHA256:ECDHE-RSA-AES256-GCM-SHA384:ECDHE\ + -ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES128-GCM-SHA256" diff --git a/roles/intel_inband_manageability/tasks/main.yml b/roles/intel_inband_manageability/tasks/main.yml new file mode 100644 index 00000000..f5e1ed84 --- /dev/null +++ b/roles/intel_inband_manageability/tasks/main.yml @@ -0,0 +1,88 @@ +## +## Copyright (c) 2020-2023 Intel Corporation. +## +## Licensed under the Apache License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. +## You may obtain a copy of the License at +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +## See the License for the specific language governing permissions and +## limitations under the License. 
+##
+---
+- name: create INBM base directory
+  ansible.builtin.file:
+    path: "{{ intel_inbm_path }}"
+    state: directory
+    mode: '0755'
+
+- name: install prerequisite packages
+  ansible.builtin.apt:
+    name:
+      - docker-buildx-plugin
+      - m4
+    state: present
+
+- name: clone INBM git repo
+  ansible.builtin.git:
+    repo: "{{ intel_inbm_git_repo }}"
+    dest: "{{ intel_inbm_path }}"
+    version: "{{ intel_inbm_git_tag }}"
+
+- name: build INBM
+  ansible.builtin.shell:
+    cmd: "set -o pipefail && ./build.sh"
+    chdir: "{{ intel_inbm_path }}"
+    executable: /bin/bash
+  register: build_result
+  environment:
+    HTTP_PROXY: "{{ proxy_env.http_proxy | d('') }}"
+    HTTPS_PROXY: "{{ proxy_env.https_proxy | d('') }}"
+    NO_PROXY: "{{ proxy_env.no_proxy | d('') }}"
+  changed_when: false
+  failed_when: build_result.rc != 0
+
+- name: install INBM
+  ansible.builtin.shell:
+    cmd: "set -o pipefail && ./install-tc.sh"
+    chdir: "{{ (intel_inbm_path, 'dist', 'inbm') | path_join }}"
+    executable: /bin/bash
+  become: true
+  register: install_result
+  environment:
+    ACCEPT_INTEL_LICENSE: "true"
+    NO_CLOUD: "{{ 'x' if intel_inband_manageability_mode == 'inbc' else '' }}"
+  changed_when: false
+  failed_when: install_result.rc != 0
+
+- name: patch MQTT ssl ciphers
+  ansible.builtin.lineinfile:
+    path: /usr/share/intel-manageability/mqtt/mosquitto.conf.dist
+    regexp: '^ciphers'
+    line: "ciphers {{ mqtt_ssl_ciphers }}"
+  become: true
+
+- name: change inbc secrets folder permissions
+  ansible.builtin.file:
+    path: "/etc/intel-manageability/secret"
+    state: directory
+    mode: '0710'
+  become: true
+
+- name: provision INBM with local INBC
+  ansible.builtin.shell:
+    cmd: "set -o pipefail && /usr/bin/provision-tc"
+    executable: /bin/bash
+  become: true
+  register: provision_result
+  environment:
+    NO_CLOUD: "x"
+    PROVISION_TPM: "auto"
+    NO_OTA_CERT: "1"
+  changed_when: false
+  failed_when: provision_result.rc != 0
+  when: intel_inband_manageability_mode == 'inbc'
diff --git a/roles/intel_inband_manageability/tasks/preflight.yml b/roles/intel_inband_manageability/tasks/preflight.yml
new file mode 100644
index 00000000..ac1c098f
--- /dev/null
+++ b/roles/intel_inband_manageability/tasks/preflight.yml
@@ -0,0 +1,29 @@
+##
+## Copyright (c) 2020-2023 Intel Corporation.
+##
+## Licensed under the Apache License, Version 2.0 (the "License");
+## you may not use this file except in compliance with the License.
+## You may obtain a copy of the License at
+##
+##   http://www.apache.org/licenses/LICENSE-2.0
+##
+## Unless required by applicable law or agreed to in writing, software
+## distributed under the License is distributed on an "AS IS" BASIS,
+## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+## See the License for the specific language governing permissions and
+## limitations under the License.
+##
+---
+- name: assert intel_inband_manageability_mode is set correctly
+  ansible.builtin.assert:
+    that: intel_inband_manageability_mode in ['inbc', 'cloud']
+    fail_msg:
+      "intel_inband_manageability_mode should be one of the following: inbc, cloud."
+
+- name: Intel In-Band Manageability OS check
+  ansible.builtin.assert:
+    that:
+      - ansible_distribution == "Ubuntu"
+      - ansible_distribution_version in ['20.04', '22.04']
+    msg: >-
+      Currently Intel In-Band Manageability is only supported on Ubuntu 20.04 and 22.04.
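The tasks above split on the mode switch: both modes build INBM from source and install it with install-tc.sh, but only 'inbc' mode provisions the agent on the spot for cloud-less local management (NO_CLOUD set, no OTA certificate, TPM auto-detected), while 'cloud' mode presumably defers provisioning to the chosen cloud backend. A minimal sketch of the host configuration this preflight expects (the enable flag name is an assumption; it does not appear in this diff):

# group_vars/all.yml (illustrative sketch only)
intel_inband_manageability_enabled: true   # assumed toggle name, not part of this patch
intel_inband_manageability_mode: inbc      # must be 'inbc' or 'cloud' per the assert above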
diff --git a/roles/intel_media_analytics/defaults/main.yaml b/roles/intel_media_analytics/defaults/main.yaml index c3a21bf6..4ae674ce 100644 --- a/roles/intel_media_analytics/defaults/main.yaml +++ b/roles/intel_media_analytics/defaults/main.yaml @@ -18,12 +18,12 @@ intel_media_analytics_namespace: "intel-media" intel_media_analytics__release_name: "intel-media" # Media Analytics -intel_media_analytics_image_src: "intel/dlstreamer" -intel_media_analytics_image_tag: "2022.3.0-ubuntu22-gpu555-dpcpp" +intel_media_analytics_image_src: "vss-dlstreamer" +intel_media_analytics_image_tag: "24.1" intel_media_analytics_local_folder: "{{ (project_root_dir, 'intel-media') | path_join }}" intel_media_analytics_local_build_name: "intel-media" -intel_media_analytics_local_build_tag: "v23.02" intel_media_analytics_sample_pod_name: "intel-media" +intel_media_analytics_pod_username: vss diff --git a/roles/intel_media_analytics/files/install-models.sh b/roles/intel_media_analytics/files/install-models.sh new file mode 100644 index 00000000..8e8f8e48 --- /dev/null +++ b/roles/intel_media_analytics/files/install-models.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +MODEL_PATH=${HOME}/models + +cd "${HOME}" || exit +rm -rf yolov5-v6.2 + +git clone --branch v6.2 --depth 1 https://github.com/ultralytics/yolov5 yolov5-v6.2 + +pip install -r yolov5-v6.2/requirements.txt +pip install onnx==1.12.0 torch==1.13.0 torchvision==0.14.0 openvino-dev==2022.3 + +mkdir -p "${MODEL_PATH}"/public/yolov5m/ +curl -L --output "${HOME}"/cars-on-highway.1920x1080.mp4 "https://www.pexels.com/video/854671/download/?h=1080&w=1920" +curl -L --output "${MODEL_PATH}"/public/yolov5m/yolov5m.pt \ + https://github.com/ultralytics/yolov5/releases/download/v6.2/yolov5m.pt +cd yolov5-v6.2 && python3 export.py --weights "${MODEL_PATH}"/public/yolov5m/yolov5m.pt \ + --imgsz 640 --batch 1 --include onnx +mo --input_model "${MODEL_PATH}"/public/yolov5m/yolov5m.onnx --model_name yolov5m \ + --scale 255 --reverse_input_channels \ + --output /model.24/m.0/Conv,/model.24/m.1/Conv,/model.24/m.2/Conv \ + --data_type FP16 \ + --output_dir "${MODEL_PATH}"/public/yolov5m/FP16 +curl -L --output "${MODEL_PATH}"/public/yolov5m/yolov5m.json \ + https://raw.githubusercontent.com/dlstreamer/dlstreamer/2022.3-release/samples/gstreamer/model_proc/public/yolo-v5.json +curl -L --output "${MODEL_PATH}"/public/yolov5m/coco_80cl.txt \ + https://github.com/dlstreamer/dlstreamer/blob/2022.3-release/samples/labels/coco_80cl.txt?raw=true diff --git a/roles/intel_media_analytics/files/media-analytics-test.sh b/roles/intel_media_analytics/files/media-analytics-test.sh new file mode 100644 index 00000000..b520e4dd --- /dev/null +++ b/roles/intel_media_analytics/files/media-analytics-test.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +source /opt/intel/oneapi/setvars.sh +source /opt/intel/openvino/setupvars.sh +source /opt/intel/dlstreamer/setupvars.sh + +VIDEO_IN=${1:-cars-on-highway.1920x1080.mp4} +VIDEO_OUT=${2:-cars-on-highway-annotated.mp4} + +DET_MODEL=models/public/yolov5m/FP16/yolov5m.xml +DET_MODEL_PROC=models/public/yolov5m/yolov5m.json + +gst-launch-1.0 -e filesrc location="${VIDEO_IN}" ! \ +qtdemux ! \ +h264parse ! \ +vaapih264dec ! \ +video/x-raw\(memory:VASurface\) ! \ +gvadetect \ +pre-process-backend=vaapi-surface-sharing \ +pre-process-config=VAAPI_FAST_SCALE_LOAD_FACTOR=1 \ +model=${DET_MODEL} \ +model-proc=${DET_MODEL_PROC} \ +device=GPU ! \ +meta_overlay ! \ +gvafpscounter ! \ +vaapih264enc ! \ +h264parse ! \ +mp4mux ! 
\ +filesink \ +location=/tmp/"${VIDEO_OUT}" diff --git a/roles/intel_media_analytics/files/run_vehicle_detection_attribute.sh b/roles/intel_media_analytics/files/run_vehicle_detection_attribute.sh deleted file mode 100644 index 87ff9d10..00000000 --- a/roles/intel_media_analytics/files/run_vehicle_detection_attribute.sh +++ /dev/null @@ -1,88 +0,0 @@ -#!/bin/bash - -VIDEO_IN=${1:-cars-on-highway.1920x1080.mp4} -VIDEO_OUT=${2:-cars-on-highway-annotated.mp4} - -# shellcheck source=/dev/null -source /opt/intel/openvino_2022/setupvars.sh -# shellcheck source=/dev/null -source /opt/intel/dlstreamer/setupvars.sh - -DET_MODEL=models/public/yolov5m/FP16/yolov5m.xml -DET_MODEL_PROC=models/public/yolov5m/yolov5m.json -DET_LABEL='labels-file=models/public/yolov5m/coco_80cl.txt' - -CLS_MODEL=models/intel/vehicle-attributes-recognition-barrier-0039/FP16-INT8/vehicle-attributes-recognition-barrier-0039.xml -CLS_MODEL_PROC=models/intel/vehicle-attributes-recognition-barrier-0039/vehicle-attributes-recognition-barrier-0039.json - -INC_DETECT_CMD=( - "gvadetect" - "pre-process-backend=vaapi-surface-sharing" - "model=${DET_MODEL}" - "model-proc=${DET_MODEL_PROC}" - "${DET_LABEL}" - "ie-config=CACHE_DIR=./cl_cache" - "device=GPU" -) - -#INC_TRACK_CMD=( -# "gvatrack" -# "tracking-type=short-term-imageless" -#) - - -if [[ -n "${CLS_LABEL}" ]]; -then -INC_CLASSIFY_CMD=( - "gvaclassify" - "pre-process-backend=vaapi-surface-sharing" - "model=${CLS_MODEL}" - "model-proc=${CLS_MODEL_PROC}" - "${CLS_LABEL}" - "inference-region=roi-list" - "object-class=car" - "ie-config=CACHE_DIR=./cl_cache" - "device=GPU" -) -else -INC_CLASSIFY_CMD=( - "gvaclassify" - "pre-process-backend=vaapi-surface-sharing" - "model=${CLS_MODEL}" - "model-proc=${CLS_MODEL_PROC}" - "inference-region=roi-list" - "object-class=car" - "ie-config=CACHE_DIR=./cl_cache" - "device=GPU" -) -fi - -#INC_METAPUBLISH_PIPLINE=( -# 'gvametaconvert' ! -# 'gvametapublish' -#) - -INC_WATERMARK_CMD=( - "meta_overlay" - "device=GPU" -) - -FULL_PIPELINE=( - "filesrc" "location=${VIDEO_IN}" ! - "decodebin" ! - "video/x-raw(memory:VASurface)" ! - "${INC_DETECT_CMD[@]}" ! -# "${INC_TRACK_CMD[@]}" ! - "${INC_CLASSIFY_CMD[@]}" ! -# "${INC_METAPUBLISH_PIPELINE[@]}" ! - "${INC_WATERMARK_CMD[@]}" ! - "gvafpscounter" ! - "queue" ! - "vaapih264enc" "bitrate=2048" ! - "h264parse" ! - "mp4mux" ! 
- "filesink" "location=/tmp/${VIDEO_OUT}" -) - -set -x -gst-launch-1.0 "${FULL_PIPELINE[@]}" diff --git a/roles/intel_media_analytics/tasks/cleanup_intel_media_analytics.yml b/roles/intel_media_analytics/tasks/cleanup_intel_media_analytics.yml index 91b2b3eb..c1a7cb9e 100644 --- a/roles/intel_media_analytics/tasks/cleanup_intel_media_analytics.yml +++ b/roles/intel_media_analytics/tasks/cleanup_intel_media_analytics.yml @@ -23,22 +23,41 @@ name: "{{ intel_media_analytics_sample_pod_name }}" namespace: "{{ intel_media_analytics_namespace }}" - - name: remove Media Analytics image from local registry + - name: remove Media Analytics image from local registry with docker block: - name: delete the tag community.docker.docker_image: state: absent name: "{{ registry_local_address }}/{{ intel_media_analytics_local_build_name }}" - tag: "{{ intel_media_analytics_local_build_tag }}" + tag: "{{ intel_media_analytics_image_tag }}" force_absent: true when: - container_runtime == "docker" + - name: remove Media Analytics images from local registry with podman + block: + - name: delete the tag + containers.podman.podman_image: + state: absent + name: "{{ registry_local_address }}/{{ intel_media_analytics_local_build_name }}" + tag: "{{ intel_media_analytics_image_tag }}" + when: + - container_runtime == "containerd" + - name: remove Media Analytics folder ansible.builtin.file: path: "{{ (intel_media_analytics_local_folder) | path_join }}" state: absent + - name: Remove Media Analytics base container folders + ansible.builtin.file: + path: "{{ item }}" + state: absent + with_items: + - "{{ base_container_path }}" + - "{{ base_container_dockerfile_path }}" + - "{{ base_container_test_path }}" + - name: remove a k8s namespace kubernetes.core.k8s: name: "{{ intel_media_analytics_namespace }}" diff --git a/roles/intel_media_analytics/tasks/intel_media_analytics_install.yml b/roles/intel_media_analytics/tasks/intel_media_analytics_install.yml index 7b8c8f14..76e81c76 100644 --- a/roles/intel_media_analytics/tasks/intel_media_analytics_install.yml +++ b/roles/intel_media_analytics/tasks/intel_media_analytics_install.yml @@ -20,14 +20,6 @@ state: directory mode: 0755 -- name: copy Media Analytics shell script to the controller node - ansible.builtin.copy: - src: "{{ item }}" - dest: "{{ (intel_media_analytics_local_folder) | path_join }}" - mode: 0644 - with_fileglob: - - ./*.sh - - name: Copy YAML templates to the controller node for each node ansible.builtin.template: src: "templates/media_analytics_sample_pod.yaml.j2" @@ -36,28 +28,6 @@ loop: "{{ groups['kube_node'] }}" when: hostvars[item].gpu_stat_gid.stat.gid is defined -- name: copy Media Analytics Dockerfile to the controller node - ansible.builtin.template: - src: "templates/Dockerfile.j2" - dest: "{{ intel_media_analytics_local_folder}}/Dockerfile" - mode: 0644 - -# docker is used as container runtime: -- name: prepare containers images - block: - - name: prepare and push containers images - vars: - image: "{{ registry_local_address }}/{{ intel_media_analytics_local_build_name }}" - tag: "{{ intel_media_analytics_local_build_tag }}" - ansible.builtin.shell: - cmd: |- - docker build -t {{ image }}:{{ tag }} -f Dockerfile . 
- docker push {{ image }}:{{ tag }} - chdir: "{{ (intel_media_analytics_local_folder) | path_join }}" - changed_when: true - when: - - container_runtime is in ['docker'] - - name: create a k8s namespace for Media Analytics kubernetes.core.k8s: name: "{{ intel_media_analytics_namespace }}" diff --git a/roles/intel_media_analytics/tasks/main.yml b/roles/intel_media_analytics/tasks/main.yml index bd1ea6d0..33ec31a4 100644 --- a/roles/intel_media_analytics/tasks/main.yml +++ b/roles/intel_media_analytics/tasks/main.yml @@ -13,12 +13,64 @@ ## See the License for the specific language governing permissions and ## limitations under the License. ## -- name: Get the group ID for GPU when gpu_dp_enabled - ansible.builtin.stat: - path: /dev/dri/renderD128 - register: gpu_stat_gid +- name: Prepare Media Analytics images + block: + - name: Get the group ID for GPU when gpu_dp_enabled + ansible.builtin.stat: + path: /dev/dri/renderD128 + register: gpu_stat_gid + when: gpu_dp_enabled + + - name: create Media Analytics folder + ansible.builtin.file: + path: "{{ (intel_media_analytics_local_folder) | path_join }}" + state: directory + mode: 0755 + + - name: copy Media Analytics shell script + ansible.builtin.copy: + src: "{{ item }}" + dest: "{{ (intel_media_analytics_local_folder) | path_join }}" + mode: 0644 + with_fileglob: + - ./*.sh + + - name: copy Media Analytics Dockerfile + ansible.builtin.template: + src: "templates/Dockerfile.j2" + dest: "{{ intel_media_analytics_local_folder}}/Dockerfile" + mode: 0644 + + - name: prepare and push containers images with docker + vars: + image: "{{ registry_local_address }}/{{ intel_media_analytics_local_build_name }}" + tag: "{{ intel_media_analytics_image_tag }}" + ansible.builtin.shell: + cmd: |- + docker build -t {{ image }}:{{ tag }} -f Dockerfile . + docker push {{ image }}:{{ tag }} + chdir: "{{ (intel_media_analytics_local_folder) | path_join }}" + changed_when: true + when: + - container_runtime is in ['docker'] + - inventory_hostname == groups['kube_node'][0] + + - name: prepare and push containers images with containerd + vars: + image: "{{ registry_local_address }}/{{ intel_media_analytics_local_build_name }}" + tag: "{{ intel_media_analytics_image_tag }}" + containers.podman.podman_image: + name: "{{ image }}:{{ tag }}" + path: "{{ (intel_media_analytics_local_folder) | path_join }}" + build: + file: Dockerfile + extra_args: "--build-arg http_proxy={{ http_proxy }} --build-arg https_proxy={{ https_proxy }}" + push: true + when: + - container_runtime is in ['containerd'] + - inventory_hostname == groups['kube_node'][0] when: - - gpu_dp_enabled + - intel_media_analytics_enabled | default(false) - inventory_hostname in groups['kube_node'] - name: install Media Analytics diff --git a/roles/intel_media_analytics/tasks/preflight_intel_media_analytics.yml b/roles/intel_media_analytics/tasks/preflight_intel_media_analytics.yml index 7148fe32..d3de2691 100644 --- a/roles/intel_media_analytics/tasks/preflight_intel_media_analytics.yml +++ b/roles/intel_media_analytics/tasks/preflight_intel_media_analytics.yml @@ -24,13 +24,22 @@ Incorrect Media Analytics configuration !! Make sure 'gpu_dp_enalbed: true' to enable Media Analytics - - name: Media Analytics - support only docker runtime + - name: Media Analytics - support docker and containerd ansible.builtin.assert: that: - - container_runtime == 'docker' + - container_runtime is in ['docker', 'containerd'] msg: | Incorrect Media Analytics configuration !! 
- Make sure 'container_runtime: docker' to enable Media Analytics + Make sure 'container_runtime' is either 'docker' or 'containerd' to enable Media Analytics + + - name: Media Analytics - check if intel_base_container_enabled and build_base_images enabled + ansible.builtin.assert: + that: + - intel_base_container_enabled | default(false) + - build_base_images | default(false) + msg: | + Incorrect Media Analytics configuration !! + Make sure 'intel_base_container_enabled: true' and 'build_base_images: true' to enable Media Analytics when: - kubernetes - intel_media_analytics_enabled | default(false) diff --git a/roles/intel_media_analytics/tasks/template_dockerfile.yml b/roles/intel_media_analytics/tasks/template_dockerfile.yml index 75031395..acb17ce6 100644 --- a/roles/intel_media_analytics/tasks/template_dockerfile.yml +++ b/roles/intel_media_analytics/tasks/template_dockerfile.yml @@ -13,8 +13,22 @@ ## See the License for the specific language governing permissions and ## limitations under the License. ## -- name: copy Media Analytics Dockerfile to the controller node +- name: create Media Analytics directory + ansible.builtin.file: + path: "{{ (dockerfiles_dir, 'media_analytics') | path_join }}" + state: directory + mode: '0755' + +- name: copy Media Analytics Dockerfile ansible.builtin.template: src: "Dockerfile.j2" - dest: "{{ (dockerfiles_dir, 'Dockerfile-intel_media_analytics') | path_join }}" - mode: 0644 + dest: "{{ (dockerfiles_dir, 'media_analytics', 'Dockerfile') | path_join }}" + mode: '0644' + +- name: copy Media Analytics shell script + ansible.builtin.copy: + src: "{{ item }}" + dest: "{{ (dockerfiles_dir, 'media_analytics') | path_join }}" + mode: '0644' + with_fileglob: + - ./*.sh diff --git a/roles/intel_media_analytics/templates/Dockerfile.j2 b/roles/intel_media_analytics/templates/Dockerfile.j2 index 3ba3f012..47704c2e 100755 --- a/roles/intel_media_analytics/templates/Dockerfile.j2 +++ b/roles/intel_media_analytics/templates/Dockerfile.j2 @@ -1,68 +1,18 @@ -## Copyright (c) 2020-2023 Intel Corporation. -## -## Licensed under the Apache License, Version 2.0 (the "License"); -## you may not use this file except in compliance with the License. -## You may obtain a copy of the License at -## -## http://www.apache.org/licenses/LICENSE-2.0 -## -## Unless required by applicable law or agreed to in writing, software -## distributed under the License is distributed on an "AS IS" BASIS, -## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -## See the License for the specific language governing permissions and -## limitations under the License. 
-## - -FROM {{ intel_media_analytics_image_src}}:{{ intel_media_analytics_image_tag }} +FROM {{ intel_media_analytics_image_src }}:{{ intel_media_analytics_image_tag }} ARG http_proxy ARG https_proxy USER root +RUN apt-get update && apt-get install -y --no-install-recommends git -RUN curl https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB -O GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \ - && apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \ - && apt-get install -y --no-install-recommends gnupg \ - && echo "deb https://apt.repos.intel.com/openvino/2022 focal main" | sudo tee /etc/apt/sources.list.d/intel-openvino-2022.list - -RUN apt-get update && apt-get install -y --no-install-recommends \ - apt-utils \ - git \ - clinfo \ - curl \ - python3-venv \ - python3-pip \ - && rm -rf /var/lib/apt/lists/* - -USER dlstreamer -ENV HOME=/home/dlstreamer - -RUN python3 -m venv venv --prompt dls2022.3 -ENV PATH="$HOME/venv/bin:$PATH" +ENV HOME=/home/{{ intel_media_analytics_pod_username }} +USER {{ intel_media_analytics_pod_username }} +WORKDIR /home/{{ intel_media_analytics_pod_username }} +COPY --chown={{ intel_media_analytics_pod_username }}:{{ intel_media_analytics_pod_username }} install-models.sh . +COPY --chown={{ intel_media_analytics_pod_username }}:{{ intel_media_analytics_pod_username }} media-analytics-test.sh . RUN python3 -m pip install --no-cache-dir -U pip \ - && pip install --no-cache-dir "openvino-dev[onnx]==2022.3.0" \ - "openvino-dev[tensorflow2]==2022.3.0" \ - "openvino-dev[pytorch]==2022.3.0" - -ENV MODEL_PATH=${HOME}/models -RUN mkdir -p ${MODEL_PATH} - -# Download classifier from OMZ and Download example video -RUN omz_downloader -o ${MODEL_PATH} --name vehicle-attributes-recognition-barrier-0039 \ - && curl --output ${MODEL_PATH}/intel/vehicle-attributes-recognition-barrier-0039/vehicle-attributes-recognition-barrier-0039.json \ - https://raw.githubusercontent.com/dlstreamer/dlstreamer/2022.3-release/samples/gstreamer/model_proc/intel/vehicle-attributes-recognition-barrier-0039.json \ - && curl -L --output cars-on-highway.1920x1080.mp4 "https://www.pexels.com/video/854671/download/?h=1080&w=1920" - -COPY --chown=dlstreamer run_vehicle_detection_attribute.sh /home/dlstreamer/run_vehicle_detection_attribute.sh -RUN chmod +x /home/dlstreamer/run_vehicle_detection_attribute.sh - -# We have 2 choices here -# 1. Run container as dlstreamer: -# We need to ensure dlstreamer has access to /dev/dri/renderD128 and /dev/dri/renderD129 via the "render" group. -# 2. 
Run container as root: -# Need to check to see if have violated the "least-privilege" principle -# USER dlstreamer -# USER root -CMD ["./run_vehicle_detection_attribute.sh"] + && pip install --no-cache-dir "openvino-dev==2022.3.0" +RUN chmod +x install-models.sh && chmod +x media-analytics-test.sh && ./install-models.sh HEALTHCHECK NONE diff --git a/roles/intel_media_analytics/templates/media_analytics_sample_pod.yaml.j2 b/roles/intel_media_analytics/templates/media_analytics_sample_pod.yaml.j2 index e54cc7a3..b8819be0 100644 --- a/roles/intel_media_analytics/templates/media_analytics_sample_pod.yaml.j2 +++ b/roles/intel_media_analytics/templates/media_analytics_sample_pod.yaml.j2 @@ -1,17 +1,17 @@ apiVersion: v1 kind: Pod metadata: - name: "{{ intel_media_analytics_sample_pod_name }}" + name: "{{ intel_media_analytics_sample_pod_name }}-{{ hostvars[item]['ansible_hostname'] }}" namespace: "{{ intel_media_analytics_namespace }}" spec: nodeSelector: - kubernetes.io/hostname: "{{ hostvars[item].inventory_hostname }}" + kubernetes.io/hostname: "{{ hostvars[item]['ansible_hostname'] }}" securityContext: runAsUser: 1000 runAsGroup: {{ hostvars[item].gpu_stat_gid.stat.gid }} containers: - - name: "{{ intel_media_analytics_sample_pod_name }}" - image: {{ registry_local_address }}/{{ intel_media_analytics_local_build_name }}:{{ intel_media_analytics_local_build_tag }} + - name: "{{ intel_media_analytics_sample_pod_name }}-{{ hostvars[item]['ansible_hostname'] }}" + image: {{ registry_local_address }}/{{ intel_media_analytics_local_build_name }}:{{ intel_media_analytics_image_tag }} command: ['sh', '-c', 'echo "Hello, Media Analytics!" && sleep infinity'] {%- if gpu_dp_enabled == true %} resources: diff --git a/roles/intel_sriov_fec_operator/defaults/main.yml b/roles/intel_sriov_fec_operator/defaults/main.yml index c1aa0be0..cfc4fc48 100644 --- a/roles/intel_sriov_fec_operator/defaults/main.yml +++ b/roles/intel_sriov_fec_operator/defaults/main.yml @@ -22,9 +22,9 @@ # Intel Smart Edge Open (SEO) SRIOV-FEC Operator intel_sriov_fec_operator_git: "https://github.com/smart-edge-open/sriov-fec-operator.git" -intel_sriov_fec_operator_git_ref: "sriov-fec-operator-23.34" +intel_sriov_fec_operator_git_ref: "02565627335dc5f55a12f648182de7a29282b93e" # TODO changeme when tag sriov-fec-operator-23.50 is out intel_sriov_fec_operator_dir: "{{ (project_root_dir, 'intel-sriov-fec-operator') | path_join }}" -intel_sriov_fec_operator_img_ver: "2.7.1" +intel_sriov_fec_operator_img_ver: "2.8.0" intel_sriov_fec_operator_tool: "{{ 'docker' if container_runtime == 'docker' else 'podman' }}" intel_sriov_fec_operator_make_tls: "false" # intel_sriov_fec_operator_target_platform: "K8S" @@ -39,7 +39,7 @@ fec_acc_dev: "{{ fec_acc }}" # Operator Package Manager (OPM) opm_url: "https://github.com/operator-framework/operator-registry/releases/download/{{ opm_ver }}/linux-amd64-opm" -opm_ver: "v1.28.0" -opm_chk: "e18e5abc8febb63c9dc76db0f33475553d98495465bd2dca81c39dcdbc875c08" +opm_ver: "v1.32.0" +opm_chk: "9cee2f0057f1a39960df0d391ca565a1d5f95ea75ca62b2a39a9dfcd05c8955d" opm_dir: "/usr/local/bin/" opm_cmd: "opm" diff --git a/roles/intel_sriov_fec_operator/files/flexran_acc200.yaml b/roles/intel_sriov_fec_operator/files/flexran_acc200.yaml deleted file mode 100644 index ab3e36c6..00000000 --- a/roles/intel_sriov_fec_operator/files/flexran_acc200.yaml +++ /dev/null @@ -1,60 +0,0 @@ -apiVersion: v1 -kind: Pod -metadata: - labels: - app: flexran-nr-pod - name: flexran-nr-pod - annotations: - k8s.v1.cni.cncf.io/networks: '[ - ]' 
-spec: - containers: - - name: flexran-container - securityContext: - privileged: true - capabilities: - add: - - SYS_ADMIN - - IPC_LOCK - - SYS_NICE - image: localhost/flexran-builder:latest - imagePullPolicy: Never - command: - - sleep - - infinity - tty: true - stdin: true - resources: - requests: - memory: "8Gi" - hugepages-1Gi: 8Gi - intel.com/intel_fec_acc200: '1' - limits: - memory: "8Gi" - hugepages-1Gi: 8Gi - intel.com/intel_fec_acc200: '1' - volumeMounts: - - name: hugepage - mountPath: /hugepages - - name: varrun - mountPath: /var/run/dpdk - readOnly: false - - name: sys - mountPath: /sys/ - readOnly: false - - name: test-bbdev-app - mountPath: /opt/backups/wireless-dpdk-ae - readOnly: false - volumes: - - name: test-bbdev-app - hostPath: - path: "/opt/backups/wireless-dpdk-ae" - mountPath: - - name: sys - hostPath: - path: "/sys" - - name: hugepage - emptyDir: - medium: HugePages - - name: varrun - emptyDir: {} diff --git a/roles/intel_sriov_fec_operator/files/flexran_acc200_vfio.yaml b/roles/intel_sriov_fec_operator/files/flexran_acc200_vfio.yaml deleted file mode 100644 index 420bbbe7..00000000 --- a/roles/intel_sriov_fec_operator/files/flexran_acc200_vfio.yaml +++ /dev/null @@ -1,63 +0,0 @@ -apiVersion: v1 -kind: Pod -metadata: - labels: - app: flexran-nr-pod - name: flexran-nr-pod - annotations: - k8s.v1.cni.cncf.io/networks: '[ - ]' -spec: - containers: - - name: flexran-container - securityContext: - privileged: true - capabilities: - add: - - SYS_ADMIN - - IPC_LOCK - - SYS_NICE - image: localhost/flexran-builder:latest - imagePullPolicy: Never - command: - - sleep - - infinity - tty: true - stdin: true - env: - - name: SRIOV_FEC_VFIO_TOKEN - value: "eb09e62c-a911-44c1-b94f-e974f3c935c6" - resources: - requests: - memory: "8Gi" - hugepages-1Gi: 8Gi - intel.com/intel_fec_acc200: '1' - limits: - memory: "8Gi" - hugepages-1Gi: 8Gi - intel.com/intel_fec_acc200: '1' - volumeMounts: - - name: hugepage - mountPath: /hugepages - - name: varrun - mountPath: /var/run/dpdk - readOnly: false - - name: sys - mountPath: /sys/ - readOnly: false - - name: test-bbdev-app - mountPath: /opt/backups/wireless-dpdk-ae - readOnly: false - volumes: - - name: test-bbdev-app - hostPath: - path: "/opt/backups/wireless-dpdk-ae" - mountPath: - - name: sys - hostPath: - path: "/sys" - - name: hugepage - emptyDir: - medium: HugePages - - name: varrun - emptyDir: {} diff --git a/roles/intel_sriov_fec_operator/tasks/preflight_sriov_fec_operator.yml b/roles/intel_sriov_fec_operator/tasks/preflight_sriov_fec_operator.yml index 4d663c04..9fbf11f1 100644 --- a/roles/intel_sriov_fec_operator/tasks/preflight_sriov_fec_operator.yml +++ b/roles/intel_sriov_fec_operator/tasks/preflight_sriov_fec_operator.yml @@ -28,7 +28,7 @@ - name: SRIOV-FEC Operator - check distro ansible.builtin.assert: - that: ansible_distribution_version == "22.04" or ansible_distribution_version == "9.2" + that: ansible_distribution_version is version('22.04', '==') or ansible_distribution_version is version('9.2', '==') fail_msg: "Deploying Intel SR-IOV FEC Operator is supported only on Ubuntu 22.04 or RHEL 9.2. Please change the o/s or correct group_vars configuration" # noqa yaml[line-length] success_msg: "Assertion passed. 
Intel SR-IOV FEC Operator is supported and can be deployed on '{{ ansible_distribution }}' distro" diff --git a/roles/intel_sriov_fec_operator/tasks/sriov_fec_operator.yml b/roles/intel_sriov_fec_operator/tasks/sriov_fec_operator.yml index 474c00d6..b24f84e1 100644 --- a/roles/intel_sriov_fec_operator/tasks/sriov_fec_operator.yml +++ b/roles/intel_sriov_fec_operator/tasks/sriov_fec_operator.yml @@ -34,7 +34,7 @@ - name: Add /sys to volumeMounts ansible.builtin.lineinfile: path: "{{ intel_sriov_fec_operator_dir }}/assets/300-daemon.yaml" - insertafter: "mountPath: /lib/modules" + insertbefore: "mountPath: /sriov_config/config" line: " - name: sys\n mountPath: /sys" - name: Add /sys to volumes ansible.builtin.lineinfile: diff --git a/roles/intel_sriov_fec_operator/templates/catalog.yml.j2 b/roles/intel_sriov_fec_operator/templates/catalog.yml.j2 index 66407e7e..186d94c4 100644 --- a/roles/intel_sriov_fec_operator/templates/catalog.yml.j2 +++ b/roles/intel_sriov_fec_operator/templates/catalog.yml.j2 @@ -5,6 +5,8 @@ metadata: namespace: olm spec: sourceType: grpc + grpcPodConfig: + securityContextConfig: restricted image: "{{ intel_sriov_fec_operator_catalog_image }}" publisher: Intel displayName: SRIOV FEC Operators (Local) diff --git a/roles/intel_xpumanager/tasks/main.yml b/roles/intel_xpumanager/tasks/main.yml index 58f9acc9..679336cc 100644 --- a/roles/intel_xpumanager/tasks/main.yml +++ b/roles/intel_xpumanager/tasks/main.yml @@ -27,7 +27,7 @@ name: create_signed_k8s_certs vars: secret_name: "{{ rbac_proxy_ssl_secret_name }}" - service_name: xpumanager + service_name: intel-xpumanager key_pair_name: xpumanager-rbac-proxy host_secrets_folder: "{{ rbac_proxy_ssl_mount_path }}" k8s_namespace: "{{ xpumanager_namespace }}" diff --git a/roles/intel_xpumanager/tasks/xpumanager_preflight.yml b/roles/intel_xpumanager/tasks/xpumanager_preflight.yml index 6e336e6f..88f30ff1 100644 --- a/roles/intel_xpumanager/tasks/xpumanager_preflight.yml +++ b/roles/intel_xpumanager/tasks/xpumanager_preflight.yml @@ -16,19 +16,14 @@ --- - name: preflight xpumanager for k8s env block: - - name: check if Observability stack is enabled + - name: check if Prometheus stack is enabled ansible.builtin.assert: that: - - prometheus_enabled | d(false) - - telegraf_enabled | d(false) - - jaeger_operator | d(false) - - opentelemetry_enabled | d(false) - - elasticsearch_enabled | d(false) - - kibana_enabled | d(false) + - prometheus_stack_enabled | d(false) msg: | Incorrect configuration !! - XPUManager requires Onservability stack. - Please enable prometheus_enabled, telegraf_enabled, jaeger_operator, opentelemetry_enabled, elasticsearch_enabled, kibana_enabled in group_vars + XPUManager requires Prometheus stack to be deployed. + Please enable prometheus_stack_enabled in group_vars - name: check if GPU Device Plugin is enabled and configured ansible.builtin.assert: @@ -57,10 +52,10 @@ - name: check if Observability stack is enabled ansible.builtin.assert: that: - - prometheus_enabled | d(false) + - prometheus_stack_enabled | d(false) msg: | Incorrect configuration !! XPUManager requires Onservability stack. 
- Please enable prometheus_enabled in group_vars + Please enable prometheus_stack_enabled in group_vars when: - not kubernetes | default(false) | bool diff --git a/roles/ipu/common/tasks/main.yml b/roles/ipu/common/tasks/main.yml index 27eaaecc..3e810429 100644 --- a/roles/ipu/common/tasks/main.yml +++ b/roles/ipu/common/tasks/main.yml @@ -21,8 +21,8 @@ - name: check supported OS for IPU ansible.builtin.assert: that: - - (ansible_distribution == "Rocky" and ansible_distribution_version == "9.1") or - (ansible_distribution == "Rocky" and ansible_distribution_version == "9.2") or + - (ansible_distribution == "Rocky" and ansible_distribution_version is version('9.1', '==')) or + (ansible_distribution == "Rocky" and ansible_distribution_version is version('9.2', '==')) or (ansible_distribution == "Fedora") fail_msg: - "Current OS - {{ ansible_distribution }} {{ ansible_distribution_version }} - is not supported for IPU" diff --git a/roles/ipu/flash_ipu_nvm/tasks/main.yml b/roles/ipu/flash_ipu_nvm/tasks/main.yml index 6dc0aab1..d08b023e 100644 --- a/roles/ipu/flash_ipu_nvm/tasks/main.yml +++ b/roles/ipu/flash_ipu_nvm/tasks/main.yml @@ -31,30 +31,26 @@ ansible.builtin.find: path: "{{ ftdi_sio_driver_dir }}" file_type: "link" - register: ftdi_sio_out -- name: set USB device addresses - ansible.builtin.command: - cmd: "basename {{ ftdi_sio_out.files|selectattr('path', 'search', item)|map(attribute='path')|replace('[','')|replace(']','') }}" - with_items: "{{ usb_address_patterns }}" - register: selected_usb_addresses - when: - - ftdi_sio_out.files|selectattr('path', 'search', item)|map(attribute='path')|replace('[','')|replace(']','') +- name: find selected USB device binding + ansible.builtin.find: + path: "{{ ftdi_sio_driver_dir }}" + file_type: "link" + patterns: "{{ usb_address_patterns }}" + use_regex: true + recurse: false + register: ftdi_sio_out -- name: show selected USB device addresses to be unbind - ansible.builtin.debug: - msg: "USB: {{ item.stdout }}" - with_items: "{{ selected_usb_addresses.results }}" - when: - - not item.skipped | default(false) +- name: Construct USB device binding list + ansible.builtin.set_fact: + selected_usb_addresses: "{{ ftdi_sio_out.files | map(attribute='path') | map('basename') | list }}" - name: unbind USB1 and USB3 from ftdi_sio driver - ansible.builtin.shell: "set -o pipefail && echo '{{ item.stdout }}' > {{ ftdi_sio_driver_dir }}/unbind" + ansible.builtin.shell: "set -o pipefail && echo '{{ item }}' > {{ ftdi_sio_driver_dir }}/unbind" args: executable: /bin/bash - with_items: "{{ selected_usb_addresses.results }}" - when: - - not item.skipped | default(false) + with_items: "{{ selected_usb_addresses }}" + changed_when: false - name: add executable permission for EthProgrammer ansible.builtin.file: @@ -72,7 +68,7 @@ async: 1200 # Maximum allowed timeout in Seconds poll: 10 # Polling Interval in Seconds environment: - DOTNET_SYSTEM_GLOBALIZATION_INVARIANT: 1 + DOTNET_SYSTEM_GLOBALIZATION_INVARIANT: "1" - name: IPU images are flashed ansible.builtin.debug: diff --git a/roles/istio_service_mesh/charts/istioctl/values.yaml b/roles/istio_service_mesh/charts/istioctl/values.yaml index b240ef45..b6bdc252 100644 --- a/roles/istio_service_mesh/charts/istioctl/values.yaml +++ b/roles/istio_service_mesh/charts/istioctl/values.yaml @@ -52,10 +52,6 @@ resources: nodeSelector: {} tolerations: - - key: "node-role.kubernetes.io/master" - operator: "Equal" - value: "" - effect: "NoSchedule" - key: "node-role.kubernetes.io/control-plane" operator: "Equal" value: "" @@ 
-64,12 +60,6 @@ tolerations: affinity: nodeAffinity: preferredDuringSchedulingIgnoredDuringExecution: - - weight: 1 - preference: - matchExpressions: - - key: "node-role.kubernetes.io/master" - operator: In - values: [""] - weight: 1 preference: matchExpressions: diff --git a/roles/istio_service_mesh/tasks/cleanup.yml b/roles/istio_service_mesh/tasks/cleanup.yml index 2805b29e..4351ea66 100644 --- a/roles/istio_service_mesh/tasks/cleanup.yml +++ b/roles/istio_service_mesh/tasks/cleanup.yml @@ -28,7 +28,7 @@ command: | kubectl exec \ --namespace {{ istio_service_mesh.istio_namespace }} \ - {{ istioctl_pod.stdout }} -- istioctl x uninstall --purge -y + {{ istioctl_pod.stdout }} -- istioctl uninstall --purge -y failed_when: false changed_when: true when: diff --git a/roles/istio_service_mesh/tasks/main.yml b/roles/istio_service_mesh/tasks/main.yml index ca5cb5bb..de095342 100644 --- a/roles/istio_service_mesh/tasks/main.yml +++ b/roles/istio_service_mesh/tasks/main.yml @@ -69,14 +69,7 @@ modprobe: name: "{{ item }}" state: present - with_items: - - "br_netfilter" - - "nf_nat" - - "xt_REDIRECT" - - "xt_owner" - - "iptable_nat" - - "iptable_mangle" - - "iptable_filter" + with_items: "{{ lookup('file', '../files/istio-netfilter.conf').splitlines() }}" - name: make module loading for iptables persistent copy: src: "istio-netfilter.conf" diff --git a/roles/istio_service_mesh/templates/tcpip-bypass-ebpf.yaml.j2 b/roles/istio_service_mesh/templates/tcpip-bypass-ebpf.yaml.j2 index e850963e..967189b5 100644 --- a/roles/istio_service_mesh/templates/tcpip-bypass-ebpf.yaml.j2 +++ b/roles/istio_service_mesh/templates/tcpip-bypass-ebpf.yaml.j2 @@ -17,8 +17,6 @@ spec: tolerations: # this toleration is to have the daemonset runnable on master nodes # remove it if your masters can't run pods - - key: node-role.kubernetes.io/master - effect: NoSchedule - key: node-role.kubernetes.io/control-plane effect: NoSchedule containers: diff --git a/roles/istio_service_mesh/vars/main.yml b/roles/istio_service_mesh/vars/main.yml index 6c521acc..53e5edee 100644 --- a/roles/istio_service_mesh/vars/main.yml +++ b/roles/istio_service_mesh/vars/main.yml @@ -16,7 +16,7 @@ istio_service_mesh_defaults: enabled: false image: istio/istioctl - version: 1.19.0 + version: 1.20.1 intel_preview: enabled: false image: intel/istioctl diff --git a/roles/jaeger_install/defaults/main.yml b/roles/jaeger_install/defaults/main.yml index 478cf405..66f4e553 100644 --- a/roles/jaeger_install/defaults/main.yml +++ b/roles/jaeger_install/defaults/main.yml @@ -13,7 +13,7 @@ ## See the License for the specific language governing permissions and ## limitations under the License. 
## -jaeger_version: v1.49.0 +jaeger_version: v1.51.0 jaeger_crd_url: https://github.com/jaegertracing/jaeger-operator/releases/download/{{ jaeger_version }}/jaeger-operator.yaml jaeger_annotations_key_to_remove: 'sidecar.jaegertracing.io/inject' jaeger_query_remove: >- diff --git a/roles/jaeger_install/files/jaeger_deployment.yml b/roles/jaeger_install/files/jaeger_deployment.yml index d7a9b1e4..4d51a388 100644 --- a/roles/jaeger_install/files/jaeger_deployment.yml +++ b/roles/jaeger_install/files/jaeger_deployment.yml @@ -5,6 +5,8 @@ metadata: namespace: monitoring spec: strategy: production + ingress: + enabled: false collector: maxReplicas: 1 resources: @@ -13,7 +15,7 @@ spec: memory: 500Mi options: es: - server-urls: https://elasticsearch-master.monitoring.svc:9200 + server-urls: https://elasticsearch-main-es-http.monitoring.svc:9200 index-prefix: jaeger_ version: 7 # Necessary as it doesn't work with 8 yet create-index-templates: false # Necessary as it doesn't work with 8 yet @@ -28,7 +30,7 @@ spec: query: base-path: /jaeger es: - server-urls: https://elasticsearch-master.monitoring.svc:9200 + server-urls: https://elasticsearch-main-es-http.monitoring.svc:9200 index-prefix: jaeger_ version: 7 # Necessary as it doesn't work with 8 yet create-index-templates: false # Necessary as it doesn't work with 8 yet @@ -41,7 +43,7 @@ spec: type: elasticsearch options: es: - server-urls: https://elasticsearch-master.monitoring.svc:9200 + server-urls: https://elasticsearch-main-es-http.monitoring.svc:9200 index-prefix: jaeger_ version: 7 # Necessary as it doesn't work with 8 yet create-index-templates: false # Necessary as it doesn't work with 8 yet diff --git a/roles/jaeger_install/tasks/main.yml b/roles/jaeger_install/tasks/main.yml index 3c93b181..9ecf676d 100644 --- a/roles/jaeger_install/tasks/main.yml +++ b/roles/jaeger_install/tasks/main.yml @@ -55,8 +55,8 @@ - name: Get Elasticsearch credentials ansible.builtin.shell: >- - kubectl get secrets --namespace=monitoring - elasticsearch-master-credentials -ojsonpath='{.data.password}' | base64 -d + set -o pipefail && kubectl get secrets --namespace=monitoring + elasticsearch-main-es-elastic-user -ojsonpath='{.data.elastic}' | base64 -d changed_when: false register: elastic_pass args: diff --git a/roles/elasticsearch_install/defaults/main.yml b/roles/jaeger_install/tasks/preflight.yml similarity index 75% rename from roles/elasticsearch_install/defaults/main.yml rename to roles/jaeger_install/tasks/preflight.yml index 874a2f4b..6c3e03a7 100644 --- a/roles/elasticsearch_install/defaults/main.yml +++ b/roles/jaeger_install/tasks/preflight.yml @@ -13,8 +13,9 @@ ## See the License for the specific language governing permissions and ## limitations under the License. ## -elasticsearch_chart_repo: "https://helm.elastic.co" -elasticsearch_chart_name: "elastic" -elasticsearch_chart_version: "8.5.1" -elasticsearch_namespace: "monitoring" -elasticsearch_release_name: "elasticsearch" +- name: Check dependencies for Jaeger + ansible.builtin.assert: + that: + - eck_enabled | default(false) + fail_msg: + "When Jaeger is enabled, elasticsearch must be enabled as well." diff --git a/roles/kibana_install/defaults/main.yml b/roles/kibana_install/defaults/main.yml index fbf67295..917f936d 100644 --- a/roles/kibana_install/defaults/main.yml +++ b/roles/kibana_install/defaults/main.yml @@ -13,6 +13,8 @@ ## See the License for the specific language governing permissions and ## limitations under the License. 
## +kibana_chart_repo: "https://helm.elastic.co" +kibana_chart_name: "elastic" kibana_chart_version: "8.5.1" kibana_namespace: "monitoring" kibana_release_name: "kibana" diff --git a/roles/kibana_install/tasks/main.yml b/roles/kibana_install/tasks/main.yml index e7765410..6c04da85 100644 --- a/roles/kibana_install/tasks/main.yml +++ b/roles/kibana_install/tasks/main.yml @@ -17,18 +17,41 @@ when: - inventory_hostname == groups['kube_control_plane'][0] block: + - name: Get Elasticsearch credentials + ansible.builtin.shell: >- + set -o pipefail && kubectl get secrets --namespace=monitoring + elasticsearch-main-es-elastic-user -ojsonpath='{.data.elastic}' + changed_when: false + register: elastic_pass + args: + executable: /bin/bash + - name: create kibana folder ansible.builtin.file: state: directory dest: "{{ (project_root_dir, 'kibana') | path_join }}" mode: 0755 - - name: populate kibana values file and push to controller node + - name: populate kibana template files and push to controller node ansible.builtin.template: - src: "kibana_values.yml.j2" - dest: "{{ (project_root_dir, 'kibana', 'kibana_values.yml') | path_join }}" + src: "{{ item }}" + dest: "{{ project_root_dir }}/kibana/{{ item | basename | regex_replace('.j2','') }}" force: yes mode: preserve + with_fileglob: + - ../templates/*.j2 + + - name: create kibana elastic secret + kubernetes.core.k8s: + state: present + src: "{{ (project_root_dir, 'kibana', item) | path_join }}" + loop: + - kibana_elastic_secret.yml + + - name: add elasticsearch chart repo + kubernetes.core.helm_repository: + name: "{{ kibana_chart_name }}" + repo_url: "{{ kibana_chart_repo }}" - name: deploy kibana kubernetes.core.helm: diff --git a/roles/kubernetes_ingress_install/tasks/preflight_kubernetes_ingress.yml b/roles/kibana_install/tasks/preflight.yml similarity index 74% rename from roles/kubernetes_ingress_install/tasks/preflight_kubernetes_ingress.yml rename to roles/kibana_install/tasks/preflight.yml index c963a10e..f4bb1a3b 100644 --- a/roles/kubernetes_ingress_install/tasks/preflight_kubernetes_ingress.yml +++ b/roles/kibana_install/tasks/preflight.yml @@ -13,10 +13,9 @@ ## See the License for the specific language governing permissions and ## limitations under the License. ## - -# - block: - # - name: preflight kubernetes ingress controller installation - # include_role: - # name: kubernetes_ingress_install - # tasks_from: preflight_kubernetes_ingress - # any_errors_fatal: true +- name: Check dependencies for Kibana + ansible.builtin.assert: + that: + - eck_enabled | default(false) + fail_msg: + "When Kibana is enabled, elasticsearch must be enabled as well." 
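Worth noting about the kibana task flow above: unlike the Jaeger task earlier in this diff, the kubectl lookup here deliberately omits base64 -d, because the value is pasted straight into the data: field of the kibana-elastic-credentials secret template (shown next), and Kubernetes secret data must stay base64-encoded. A minimal post-deployment check could look like the sketch below; the task is illustrative only, not part of the role, and assumes the kubernetes.core collection is available on the controller:

- name: verify the kibana credentials secret exists (illustrative sketch)
  kubernetes.core.k8s_info:
    kind: Secret
    name: kibana-elastic-credentials
    namespace: "{{ kibana_namespace }}"
  register: kibana_secret_info
  # fail if the secret rendered from kibana_elastic_secret.yml was not created
  failed_when: kibana_secret_info.resources | length == 0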
diff --git a/roles/kibana_install/templates/kibana_elastic_secret.yml.j2 b/roles/kibana_install/templates/kibana_elastic_secret.yml.j2 new file mode 100644 index 00000000..5994730a --- /dev/null +++ b/roles/kibana_install/templates/kibana_elastic_secret.yml.j2 @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: Secret +metadata: + name: "kibana-elastic-credentials" + namespace: "{{ kibana_namespace }}" +data: + password: "{{ elastic_pass.stdout_lines[0] }}" + username: "ZWxhc3RpYw==" diff --git a/roles/kibana_install/templates/kibana_values.yml.j2 b/roles/kibana_install/templates/kibana_values.yml.j2 index 7b06479d..3c872b65 100644 --- a/roles/kibana_install/templates/kibana_values.yml.j2 +++ b/roles/kibana_install/templates/kibana_values.yml.j2 @@ -1,8 +1,8 @@ --- -elasticsearchHosts: "https://elasticsearch-master.monitoring.svc:9200" +elasticsearchHosts: "https://elasticsearch-main-es-http.monitoring.svc:9200" elasticsearchCertificateSecret: elasticsearch-tls elasticsearchCertificateAuthoritiesFile: ca.crt -elasticsearchCredentialSecret: elasticsearch-master-credentials +elasticsearchCredentialSecret: kibana-elastic-credentials replicas: 1 @@ -64,10 +64,10 @@ updateStrategy: type: "Recreate" service: - type: NodePort + type: ClusterIP loadBalancerIP: "" port: 5601 - nodePort: 30001 + # nodePort: 30001 labels: {} annotations: {} # cloud.google.com/load-balancer-type: "Internal" diff --git a/roles/kmra_install/tasks/create_cosign_tls_secrets.yml b/roles/kmra_install/tasks/create_cosign_tls_secrets.yml index 7efa01e4..1d59d9d0 100644 --- a/roles/kmra_install/tasks/create_cosign_tls_secrets.yml +++ b/roles/kmra_install/tasks/create_cosign_tls_secrets.yml @@ -93,9 +93,10 @@ - name: set fact of all secrets ansible.builtin.set_fact: - "cosign_{{ item['source'] | basename | replace('.','')}}": "{{ item['content'] | replace(\"'\",'') }}" - no_log: true + cosign_secrets: "{{ cosign_secrets | default({}) | \ + combine({ item['source'] | basename | replace('.','_'): item['content'] | replace(\"'\",'') }) }}" loop: "{{ secret_files.results }}" + no_log: true - name: create provider and operator secrets for cosign kubernetes.core.k8s: @@ -108,12 +109,13 @@ name: "{{ item.name }}-cosign" namespace: "{{ item.namespace | default(kmra.namespace) }}" data: - cosign.ca: "{{ cosign_cacrt }}" - cosign.cert: "{{ hostvars[inventory_hostname]['cosign_' + item.name + 'crt'] }}" - cosign.key: "{{ hostvars[inventory_hostname]['cosign_' + item.name + 'cosignkey'] }}" - cosign.pub: "{{ hostvars[inventory_hostname]['cosign_' + item.name + 'cosignpub'] }}" + cosign.ca: "{{ cosign_secrets['ca_crt'] }}" + cosign.cert: "{{ cosign_secrets[item.name + '_crt'] }}" + cosign.key: "{{ cosign_secrets[item.name + '_cosign_key'] }}" + cosign.pub: "{{ cosign_secrets[item.name + '_cosign_pub'] }}" stringData: cosign.password: "{{ cosign_password }}" + no_log: true loop: "{{ secrets }}" - name: create pubkey secret for policy-controller @@ -127,7 +129,8 @@ name: "{{ item.name }}-cosign-pubkey" namespace: "{{ cosign_namespace }}" data: - cosign.pub: "{{ hostvars[inventory_hostname]['cosign_' + item.name + 'cosignpub'] }}" + cosign.pub: "{{ cosign_secrets[item.name + '_cosign_pub'] }}" + no_log: true loop: "{{ secrets }}" - name: clean up tmp directory diff --git a/roles/kmra_install/tasks/create_custom_tls_configmap.yml b/roles/kmra_install/tasks/create_custom_tls_configmap.yml index 20371088..2107cf32 100644 --- a/roles/kmra_install/tasks/create_custom_tls_configmap.yml +++ b/roles/kmra_install/tasks/create_custom_tls_configmap.yml @@ -118,9 
+118,10 @@ - name: set fact of all secrets ansible.builtin.set_fact: - "custom_tls_{{ item['source'] | basename | replace('.','')}}": "{{ item['content'] | replace(\"'\",'') | b64decode }}" - no_log: true + custom_tls_secrets: "{{ custom_tls_secrets | default({}) | \ + combine({ item['source'] | basename | replace('.','_'): item['content'] | replace(\"'\",'') | b64decode }) }}" loop: "{{ secret_files.results }}" + no_log: true - name: create configmap for the kmra app custom-tls kubernetes.core.k8s: @@ -132,14 +133,15 @@ name: kmra-apphsm-custom-config namespace: "{{ kmra.namespace }}" data: - server_cu.key: "{{ custom_tls_token_server_cukey }}" - server_cu.crt: "{{ custom_tls_token_server_cucrt }}" - server_du.key: "{{ custom_tls_token_server_dukey }}" - server_du.crt: "{{ custom_tls_token_server_ducrt }}" - server_ric.key: "{{ custom_tls_token_server_rickey }}" - server_ric.crt: "{{ custom_tls_token_server_riccrt }}" - client.key: "{{ custom_tls_token_clientkey }}" - client.crt: "{{ custom_tls_token_clientcrt }}" + server_cu.key: "{{ custom_tls_secrets['token_server_cu_key'] }}" + server_cu.crt: "{{ custom_tls_secrets['token_server_cu_crt'] }}" + server_du.key: "{{ custom_tls_secrets['token_server_du_key'] }}" + server_du.crt: "{{ custom_tls_secrets['token_server_du_crt'] }}" + server_ric.key: "{{ custom_tls_secrets['token_server_ric_key'] }}" + server_ric.crt: "{{ custom_tls_secrets['token_server_ric_crt'] }}" + client.key: "{{ custom_tls_secrets['token_client_key'] }}" + client.crt: "{{ custom_tls_secrets['token_client_crt'] }}" + no_log: true - name: clean up tmp directory ansible.builtin.file: @@ -157,20 +159,22 @@ - name: get ca crt base64 ansible.builtin.shell: >- set -o pipefail && - echo -n "{{ custom_tls_cacrt }}" | openssl x509 -outform der | base64 -w0 + echo -n "{{ custom_tls_secrets['ca_crt'] }}" | openssl x509 -outform der | base64 -w0 register: ca_crt_str args: executable: /bin/bash changed_when: true + no_log: true - name: get client crt base64 ansible.builtin.shell: >- set -o pipefail && - echo -n "{{ custom_tls_token_clientcrt }}" | openssl x509 -outform der | base64 -w0 + echo -n "{{ custom_tls_secrets['token_client_crt'] }}" | openssl x509 -outform der | base64 -w0 register: client_crt_str args: executable: /bin/bash changed_when: true + no_log: true - name: populate tls_truststore.xml ansible.builtin.template: @@ -181,11 +185,12 @@ - name: get client finger ansible.builtin.shell: >- set -o pipefail && - echo -n "{{ custom_tls_cacrt }}" | openssl x509 -noout -fingerprint | cut -b '18-' + echo -n "{{ custom_tls_secrets['ca_crt'] }}" | openssl x509 -noout -fingerprint | cut -b '18-' register: finger_str args: executable: /bin/bash changed_when: true + no_log: true - name: populate tls_listen.xml ansible.builtin.template: @@ -198,18 +203,18 @@ no_log: true loop: - { - key: "{{ custom_tls_token_server_cukey }}", - crt: "{{ custom_tls_token_server_cucrt }}", + key: "{{ custom_tls_secrets['token_server_cu_key'] }}", + crt: "{{ custom_tls_secrets['token_server_cu_crt'] }}", xml: 'tls_keystore_cu.xml' } - { - key: "{{ custom_tls_token_server_dukey }}", - crt: "{{ custom_tls_token_server_ducrt }}", + key: "{{ custom_tls_secrets['token_server_du_key'] }}", + crt: "{{ custom_tls_secrets['token_server_du_crt'] }}", xml: 'tls_keystore_du.xml' } - { - key: "{{ custom_tls_token_server_rickey }}", - crt: "{{ custom_tls_token_server_riccrt }}", + key: "{{ custom_tls_secrets['token_server_ric_key'] }}", + crt: "{{ custom_tls_secrets['token_server_ric_crt'] }}", xml: 
'tls_keystore_ric.xml' } @@ -223,15 +228,16 @@ - name: read all sysrepo files ansible.builtin.slurp: src: "{{ item.path }}" - register: sysrepo_files + register: sysrepo_files_content no_log: true loop: "{{ sysrepo_list.files }}" - name: set fact of all sysrepo files ansible.builtin.set_fact: - "sysrepo_{{ item['source'] | basename | replace('.','')}}": "{{ item['content'] | replace(\"'\",'') | b64decode }}" + sysrepo_files: "{{ sysrepo_files | default({}) | \ + combine({ item['source'] | basename | replace('.','_'): item['content'] | replace(\"'\",'') | b64decode }) }}" + loop: "{{ sysrepo_files_content.results }}" no_log: true - loop: "{{ sysrepo_files.results }}" - name: create configmap for the netopeer2 app kubernetes.core.k8s: @@ -244,26 +250,26 @@ namespace: "{{ cosign_enforce_namespace }}" data: tls_keystore.xml: "{{ item.xml }}" - tls_listen.xml: "{{ sysrepo_tls_listenxml }}" - tls_truststore.xml: "{{ sysrepo_tls_truststorexml }}" + tls_listen.xml: "{{ sysrepo_files['tls_listen_xml'] }}" + tls_truststore.xml: "{{ sysrepo_files['tls_truststore_xml'] }}" no_log: true loop: - { key: "{{ kmra.oran_netopeer2_server.release_name }}-cu", - xml: "{{ sysrepo_tls_keystore_cuxml }}" + xml: "{{ sysrepo_files['tls_keystore_cu_xml'] }}" } - { key: "{{ kmra.oran_netopeer2_server.release_name }}-du", - xml: "{{ sysrepo_tls_keystore_duxml }}" + xml: "{{ sysrepo_files['tls_keystore_du_xml'] }}" } - { key: "{{ kmra.oran_netopeer2_server.release_name }}-ric", - xml: "{{ sysrepo_tls_keystore_ricxml }}" + xml: "{{ sysrepo_files['tls_keystore_ric_xml'] }}" } # just reuse cu keystore for client sysrepo format check only - { key: "{{ kmra.oran_netopeer2_client.release_name }}", - xml: "{{ sysrepo_tls_keystore_cuxml }}" + xml: "{{ sysrepo_files['tls_keystore_cu_xml'] }}" } - name: clean up tmp directory diff --git a/roles/kmra_install/tasks/kmra_oran_preflight.yml b/roles/kmra_install/tasks/kmra_oran_preflight.yml index 3cc5c05d..a3d9a356 100644 --- a/roles/kmra_install/tasks/kmra_oran_preflight.yml +++ b/roles/kmra_install/tasks/kmra_oran_preflight.yml @@ -48,7 +48,7 @@ - name: check the oran image integrity assert: - that: "provided_oran.stat.checksum == '{{ kmra_oran.oran.oran_image_checksum }}'" + that: provided_oran.stat.checksum == kmra_oran.oran.oran_image_checksum msg: - File {{ kmra_oran.oran.oran_image_staging_location }} on localhost is NOT the expected one. - Please provide the correct file. diff --git a/roles/kmra_install/tasks/kmra_sbx_preflight.yml b/roles/kmra_install/tasks/kmra_sbx_preflight.yml index 9b27a9d0..a9dbf454 100644 --- a/roles/kmra_install/tasks/kmra_sbx_preflight.yml +++ b/roles/kmra_install/tasks/kmra_sbx_preflight.yml @@ -37,7 +37,7 @@ - name: check the sbx image integrity assert: - that: "provided_sbx.stat.checksum == '{{ kmra_defaults.apphsm.sbx_image_checksum }}'" + that: provided_sbx.stat.checksum == kmra_defaults.apphsm.sbx_image_checksum msg: - File {{ kmra_defaults.apphsm.sbx_image_staging_location }} on localhost is NOT the expected one. - Please provide the correct file. 
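The two checksum asserts above are fixed for the same reason as the earlier version() and available_governors changes: the that: field of ansible.builtin.assert is already evaluated as a Jinja expression, so nesting '{{ ... }}' inside it is double templating, which ansible-lint flags and newer ansible-core versions warn about. A minimal sketch of the corrected idiom, with hypothetical variable names:

# hypothetical names; illustrates the bare-expression form used in the fixes above
- name: check a staged image checksum (illustrative sketch)
  ansible.builtin.assert:
    that: staged_image.stat.checksum == expected_image_checksum
    fail_msg: "Staged image checksum does not match the expected value."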
diff --git a/roles/kmra_install/tasks/main.yml b/roles/kmra_install/tasks/main.yml index 83743e86..186d749b 100644 --- a/roles/kmra_install/tasks/main.yml +++ b/roles/kmra_install/tasks/main.yml @@ -38,13 +38,12 @@ name: sgx_prv state: present when: - - (ansible_distribution == "Ubuntu" and ansible_distribution_version >= '21.04') - or (ansible_os_family == "RedHat" and ansible_distribution_version >= '8.4') + - (ansible_distribution == "Ubuntu" and ansible_distribution_version is version('21.04', '>=')) + or (ansible_os_family == "RedHat" and ansible_distribution_version is version('8.4', '>=')) - - name: determine sgx_prv GID + - name: determine sgx_prv and sgx ID ansible.builtin.getent: database: group - key: sgx_prv when: - kmra.ctk_loadkey_demo.enabled or kmra.oran.enabled | default (false) - inventory_hostname == groups['kube_node'][0] @@ -227,11 +226,13 @@ - item.deploy - name: install KMRA pccs helm chart - command: >- - helm upgrade -i {{ kmra.pccs.release_name }} - --namespace {{ kmra.namespace }} - -f {{ kmra.pccs.helm_values_file }} - {{ kmra.pccs.chart_path }} + kubernetes.core.helm: + chart_ref: "{{ kmra.pccs.chart_path }}" + release_name: "{{ kmra.pccs.release_name }}" + release_namespace: "{{ kmra.namespace }}" + values_files: "{{ kmra.pccs.helm_values_file }}" + create_namespace: true + force: true when: - kmra.pccs.enabled | default(false) @@ -252,11 +253,13 @@ - kmra.oran.enabled | default(false) - name: install KMRA AppHSM helm chart - command: >- - helm upgrade -i {{ kmra.apphsm.release_name }} - --namespace {{ kmra.namespace }} - -f {{ kmra.apphsm.helm_values_file }} - {{ kmra.apphsm.chart_path }} + kubernetes.core.helm: + chart_ref: "{{ kmra.apphsm.chart_path }}" + release_name: "{{ kmra.apphsm.release_name }}" + release_namespace: "{{ kmra.namespace }}" + values_files: "{{ kmra.apphsm.helm_values_file }}" + create_namespace: true + force: true when: - kmra.apphsm.enabled | default(false) @@ -272,11 +275,13 @@ wait_timeout: 600 - name: install KMRA Ctk loadkey helm chart - command: >- - helm upgrade -i {{ kmra.ctk_loadkey_demo.release_name }} - --namespace {{ kmra.namespace }} - -f {{ kmra.ctk_loadkey_demo.helm_values_file }} - {{ kmra.ctk_loadkey_demo.chart_path }} + kubernetes.core.helm: + chart_ref: "{{ kmra.ctk_loadkey_demo.chart_path }}" + release_name: "{{ kmra.ctk_loadkey_demo.release_name }}" + release_namespace: "{{ kmra.namespace }}" + values_files: "{{ kmra.ctk_loadkey_demo.helm_values_file }}" + create_namespace: true + force: true when: - kmra.ctk_loadkey_demo.enabled | default(false) diff --git a/roles/kubernetes_ingress_install/tasks/cleanup_kubernetes_ingress.yml b/roles/kubernetes_ingress_install/tasks/cleanup_kubernetes_ingress.yml deleted file mode 100644 index b60039df..00000000 --- a/roles/kubernetes_ingress_install/tasks/cleanup_kubernetes_ingress.yml +++ /dev/null @@ -1,35 +0,0 @@ -## -## Copyright (c) 2020-2023 Intel Corporation. -## -## Licensed under the Apache License, Version 2.0 (the "License"); -## you may not use this file except in compliance with the License. -## You may obtain a copy of the License at -## -## http://www.apache.org/licenses/LICENSE-2.0 -## -## Unless required by applicable law or agreed to in writing, software -## distributed under the License is distributed on an "AS IS" BASIS, -## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -## See the License for the specific language governing permissions and -## limitations under the License. 
-## ---- -- block: - - name: delete Kubernetes Ingress Controller Helm Charts - command: >- - helm delete {{ kubernetes_ingress_release_name }} --namespace {{ kubernetes_ingress_helm_chart_release_namespace }} - when: - - inventory_hostname == groups['kube_control_plane'][0] - changed_when: false - failed_when: false - - name: delete Kubernetes Ingress Controller Helm Repo - command: >- - helm repo remove {{ kubernetes_ingress_helm_chart_repo_name }} - when: - - inventory_hostname == groups['kube_control_plane'][0] - changed_when: false - failed_when: false - tags: - - minio - when: - - inventory_hostname == groups['kube_control_plane'][0] diff --git a/roles/kubernetes_ingress_install/tasks/kubernetes_ingress_install.yml b/roles/kubernetes_ingress_install/tasks/kubernetes_ingress_install.yml deleted file mode 100755 index c0a24a05..00000000 --- a/roles/kubernetes_ingress_install/tasks/kubernetes_ingress_install.yml +++ /dev/null @@ -1,66 +0,0 @@ -## -## Copyright (c) 2020-2023 Intel Corporation. -## -## Licensed under the Apache License, Version 2.0 (the "License"); -## you may not use this file except in compliance with the License. -## You may obtain a copy of the License at -## -## http://www.apache.org/licenses/LICENSE-2.0 -## -## Unless required by applicable law or agreed to in writing, software -## distributed under the License is distributed on an "AS IS" BASIS, -## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -## See the License for the specific language governing permissions and -## limitations under the License. -## ---- -- block: - - name: check Kubernetes-Ingress Helm charts directory. - stat: - path: "{{ (project_root_dir, 'charts', 'kubernetes-ingress') | path_join }}" - register: kubernetes_ingress_path - - - name: create Kubernetes-Ingress Helm charts directory if needed - file: - path: "{{ (project_root_dir, 'charts', 'kubernetes-ingress') | path_join }}" - state: directory - mode: 0755 - when: - - kubernetes_ingress_path.stat.exists is defined and not kubernetes_ingress_path.stat.exists - - - name: check Kubernetes-Ingress Helm charts temp directory. 
- stat: - path: "{{ (project_root_dir, 'charts', 'kubernetes-ingress', 'temp') | path_join }}" - register: kubernetes_ingress_temp_dir - - - name: create the temp folder for Kubernetes-Ingress custom values - file: - path: "{{ (project_root_dir, 'charts', 'kubernetes-ingress', 'temp') | path_join }}" - state: directory - mode: 0755 - when: - - not kubernetes_ingress_temp_dir.stat.exists - - - name: populate Kubernetes-Ingress Helm charts values template and push to controller node - template: - src: "kubernetes_ingress_custom_values.yml.j2" - dest: "{{ (project_root_dir, 'charts', 'kubernetes-ingress', 'temp', 'kubernetes-ingress-custom-values.yml') | path_join }}" - force: yes - mode: preserve - - - name: Add Kubernetes Ingress Controller Helm Chart Repository - command: >- - helm repo add "{{ kubernetes_ingress_helm_chart_repo_name }}" "{{ kubernetes_ingress_helm_repo_url }}" - changed_when: true - - - name: Deploy {{ kubernetes_ingress_helm_chart_version }} of {{ kubernetes_ingress_helm_chart_ref }} - command: >- - helm install - {{ kubernetes_ingress_release_name }} - {{ kubernetes_ingress_helm_chart_ref }} - --namespace {{ kubernetes_ingress_helm_chart_release_namespace }} - --create-namespace - -f {{ (project_root_dir, 'charts', 'kubernetes-ingress', 'temp', 'kubernetes-ingress-custom-values.yml') | path_join }} - changed_when: true - when: - - inventory_hostname == groups['kube_control_plane'][0] diff --git a/roles/kubernetes_ingress_install/templates/kubernetes_ingress_custom_values.yml.j2 b/roles/kubernetes_ingress_install/templates/kubernetes_ingress_custom_values.yml.j2 deleted file mode 100755 index 80f31f01..00000000 --- a/roles/kubernetes_ingress_install/templates/kubernetes_ingress_custom_values.yml.j2 +++ /dev/null @@ -1,375 +0,0 @@ -controller: - ## The name of the Ingress Controller daemonset or deployment. - ## Autogenerated if not set or set to "". - # name: nginx-ingress - - ## The kind of the Ingress Controller installation - deployment or daemonset. - kind: deployment - - ## Deploys the Ingress Controller for NGINX Plus. - nginxplus: false - - # Timeout in milliseconds which the Ingress Controller will wait for a successful NGINX reload after a change or at the initial start. - nginxReloadTimeout: 60000 - - ## Support for App Protect - appprotect: - ## Enable the App Protect module in the Ingress Controller. - enable: false - ## Sets log level for App Protect. Allowed values: fatal, error, warn, info, debug, trace - logLevel: debug - - ## Support for App Protect Dos - appprotectdos: - ## Enable the App Protect Dos module in the Ingress Controller. - enable: false - ## Enable debugging for App Protect Dos. - debug: false - ## Max number of nginx processes to support. - maxWorkers: 0 - ## Max number of ADMD instances. - maxDaemons: 0 - ## RAM memory size to consume in MB. - memory: 0 - - ## Enables the Ingress Controller pods to use the host's network namespace. - hostNetwork: false - - ## Enables debugging for NGINX. Uses the nginx-debug binary. Requires error-log-level: debug in the ConfigMap via `controller.config.entries`. - nginxDebug: false - - ## The log level of the Ingress Controller. - logLevel: 4 - - ## A list of custom ports to expose on the NGINX ingress controller pod. Follows the conventional Kubernetes yaml syntax for container ports. - customPorts: [] - - image: - ## The image repository of the Ingress Controller. - repository: nginx/nginx-ingress - - ## The tag of the Ingress Controller image. 
- tag: "2.3.1" - - ## The pull policy for the Ingress Controller image. - pullPolicy: IfNotPresent - - config: - ## The name of the ConfigMap used by the Ingress Controller. - ## Autogenerated if not set or set to "". - # name: nginx-config - - ## The annotations of the Ingress Controller configmap. - annotations: {} - - ## The entries of the ConfigMap for customizing NGINX configuration. - entries: {} - - ## It is recommended to use your own TLS certificates and keys - defaultTLS: - ## The base64-encoded TLS certificate for the default HTTPS server. If not specified, a pre-generated self-signed certificate is used. - ## Note: It is recommended that you specify your own certificate. - cert: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUN2akNDQWFZQ0NRREFPRjl0THNhWFhEQU5CZ2txaGtpRzl3MEJBUXNGQURBaE1SOHdIUVlEVlFRRERCWk8KUjBsT1dFbHVaM0psYzNORGIyNTBjbTlzYkdWeU1CNFhEVEU0TURreE1qRTRNRE16TlZvWERUSXpNRGt4TVRFNApNRE16TlZvd0lURWZNQjBHQTFVRUF3d1dUa2RKVGxoSmJtZHlaWE56UTI5dWRISnZiR3hsY2pDQ0FTSXdEUVlKCktvWklodmNOQVFFQkJRQURnZ0VQQURDQ0FRb0NnZ0VCQUwvN2hIUEtFWGRMdjNyaUM3QlBrMTNpWkt5eTlyQ08KR2xZUXYyK2EzUDF0azIrS3YwVGF5aGRCbDRrcnNUcTZzZm8vWUk1Y2Vhbkw4WGM3U1pyQkVRYm9EN2REbWs1Qgo4eDZLS2xHWU5IWlg0Rm5UZ0VPaStlM2ptTFFxRlBSY1kzVnNPazFFeUZBL0JnWlJVbkNHZUtGeERSN0tQdGhyCmtqSXVuektURXUyaDU4Tlp0S21ScUJHdDEwcTNRYzhZT3ExM2FnbmovUWRjc0ZYYTJnMjB1K1lYZDdoZ3krZksKWk4vVUkxQUQ0YzZyM1lma1ZWUmVHd1lxQVp1WXN2V0RKbW1GNWRwdEMzN011cDBPRUxVTExSakZJOTZXNXIwSAo1TmdPc25NWFJNV1hYVlpiNWRxT3R0SmRtS3FhZ25TZ1JQQVpQN2MwQjFQU2FqYzZjNGZRVXpNQ0F3RUFBVEFOCkJna3Foa2lHOXcwQkFRc0ZBQU9DQVFFQWpLb2tRdGRPcEsrTzhibWVPc3lySmdJSXJycVFVY2ZOUitjb0hZVUoKdGhrYnhITFMzR3VBTWI5dm15VExPY2xxeC9aYzJPblEwMEJCLzlTb0swcitFZ1U2UlVrRWtWcitTTFA3NTdUWgozZWI4dmdPdEduMS9ienM3bzNBaS9kclkrcUI5Q2k1S3lPc3FHTG1US2xFaUtOYkcyR1ZyTWxjS0ZYQU80YTY3Cklnc1hzYktNbTQwV1U3cG9mcGltU1ZmaXFSdkV5YmN3N0NYODF6cFErUyt1eHRYK2VBZ3V0NHh3VlI5d2IyVXYKelhuZk9HbWhWNThDd1dIQnNKa0kxNXhaa2VUWXdSN0diaEFMSkZUUkk3dkhvQXprTWIzbjAxQjQyWjNrN3RXNQpJUDFmTlpIOFUvOWxiUHNoT21FRFZkdjF5ZytVRVJxbStGSis2R0oxeFJGcGZnPT0KLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo= - - ## The base64-encoded TLS key for the default HTTPS server. Note: If not specified, a pre-generated key is used. - ## Note: It is recommended that you specify your own key. 
- key: LS0tLS1CRUdJTiBSU0EgUFJJVkFURSBLRVktLS0tLQpNSUlFcEFJQkFBS0NBUUVBdi91RWM4b1JkMHUvZXVJTHNFK1RYZUprckxMMnNJNGFWaEMvYjVyYy9XMlRiNHEvClJOcktGMEdYaVN1eE9ycXgrajlnamx4NXFjdnhkenRKbXNFUkJ1Z1B0ME9hVGtIekhvb3FVWmcwZGxmZ1dkT0EKUTZMNTdlT1l0Q29VOUZ4amRXdzZUVVRJVUQ4R0JsRlNjSVo0b1hFTkhzbysyR3VTTWk2Zk1wTVM3YUhudzFtMApxWkdvRWEzWFNyZEJ6eGc2clhkcUNlUDlCMXl3VmRyYURiUzc1aGQzdUdETDU4cGszOVFqVUFQaHpxdmRoK1JWClZGNGJCaW9CbTVpeTlZTW1hWVhsMm0wTGZzeTZuUTRRdFFzdEdNVWozcGJtdlFmazJBNnljeGRFeFpkZFZsdmwKMm82MjBsMllxcHFDZEtCRThCay90elFIVTlKcU56cHpoOUJUTXdJREFRQUJBb0lCQVFDZklHbXowOHhRVmorNwpLZnZJUXQwQ0YzR2MxNld6eDhVNml4MHg4Mm15d1kxUUNlL3BzWE9LZlRxT1h1SENyUlp5TnUvZ2IvUUQ4bUFOCmxOMjRZTWl0TWRJODg5TEZoTkp3QU5OODJDeTczckM5bzVvUDlkazAvYzRIbjAzSkVYNzZ5QjgzQm9rR1FvYksKMjhMNk0rdHUzUmFqNjd6Vmc2d2szaEhrU0pXSzBwV1YrSjdrUkRWYmhDYUZhNk5nMUZNRWxhTlozVDhhUUtyQgpDUDNDeEFTdjYxWTk5TEI4KzNXWVFIK3NYaTVGM01pYVNBZ1BkQUk3WEh1dXFET1lvMU5PL0JoSGt1aVg2QnRtCnorNTZud2pZMy8yUytSRmNBc3JMTnIwMDJZZi9oY0IraVlDNzVWYmcydVd6WTY3TWdOTGQ5VW9RU3BDRkYrVm4KM0cyUnhybnhBb0dCQU40U3M0ZVlPU2huMVpQQjdhTUZsY0k2RHR2S2ErTGZTTXFyY2pOZjJlSEpZNnhubmxKdgpGenpGL2RiVWVTbWxSekR0WkdlcXZXaHFISy9iTjIyeWJhOU1WMDlRQ0JFTk5jNmtWajJTVHpUWkJVbEx4QzYrCk93Z0wyZHhKendWelU0VC84ajdHalRUN05BZVpFS2FvRHFyRG5BYWkyaW5oZU1JVWZHRXFGKzJyQW9HQkFOMVAKK0tZL0lsS3RWRzRKSklQNzBjUis3RmpyeXJpY05iWCtQVzUvOXFHaWxnY2grZ3l4b25BWlBpd2NpeDN3QVpGdwpaZC96ZFB2aTBkWEppc1BSZjRMazg5b2pCUmpiRmRmc2l5UmJYbyt3TFU4NUhRU2NGMnN5aUFPaTVBRHdVU0FkCm45YWFweUNweEFkREtERHdObit3ZFhtaTZ0OHRpSFRkK3RoVDhkaVpBb0dCQUt6Wis1bG9OOTBtYlF4VVh5YUwKMjFSUm9tMGJjcndsTmVCaWNFSmlzaEhYa2xpSVVxZ3hSZklNM2hhUVRUcklKZENFaHFsV01aV0xPb2I2NTNyZgo3aFlMSXM1ZUtka3o0aFRVdnpldm9TMHVXcm9CV2xOVHlGanIrSWhKZnZUc0hpOGdsU3FkbXgySkJhZUFVWUNXCndNdlQ4NmNLclNyNkQrZG8wS05FZzFsL0FvR0FlMkFVdHVFbFNqLzBmRzgrV3hHc1RFV1JqclRNUzRSUjhRWXQKeXdjdFA4aDZxTGxKUTRCWGxQU05rMXZLTmtOUkxIb2pZT2pCQTViYjhibXNVU1BlV09NNENoaFJ4QnlHbmR2eAphYkJDRkFwY0IvbEg4d1R0alVZYlN5T294ZGt5OEp0ek90ajJhS0FiZHd6NlArWDZDODhjZmxYVFo5MWpYL3RMCjF3TmRKS2tDZ1lCbyt0UzB5TzJ2SWFmK2UwSkN5TGhzVDQ5cTN3Zis2QWVqWGx2WDJ1VnRYejN5QTZnbXo5aCsKcDNlK2JMRUxwb3B0WFhNdUFRR0xhUkcrYlNNcjR5dERYbE5ZSndUeThXczNKY3dlSTdqZVp2b0ZpbmNvVlVIMwphdmxoTUVCRGYxSjltSDB5cDBwWUNaS2ROdHNvZEZtQktzVEtQMjJhTmtsVVhCS3gyZzR6cFE9PQotLS0tLUVORCBSU0EgUFJJVkFURSBLRVktLS0tLQo= - - ## The secret with a TLS certificate and key for the default HTTPS server. - ## The value must follow the following format: `/`. - ## Used as an alternative to specifying a certificate and key using `controller.defaultTLS.cert` and `controller.defaultTLS.key` parameters. - ## Format: / - secret: - - wildcardTLS: - ## The base64-encoded TLS certificate for every Ingress/VirtualServer host that has TLS enabled but no secret specified. - ## If the parameter is not set, for such Ingress/VirtualServer hosts NGINX will break any attempt to establish a TLS connection. - cert: "" - - ## The base64-encoded TLS key for every Ingress/VirtualServer host that has TLS enabled but no secret specified. - ## If the parameter is not set, for such Ingress/VirtualServer hosts NGINX will break any attempt to establish a TLS connection. - key: "" - - ## The secret with a TLS certificate and key for every Ingress/VirtualServer host that has TLS enabled but no secret specified. - ## The value must follow the following format: `/`. - ## Used as an alternative to specifying a certificate and key using `controller.wildcardTLS.cert` and `controller.wildcardTLS.key` parameters. - ## Format: / - secret: - - ## The node selector for pod assignment for the Ingress Controller pods. 
- nodeSelector: {} - - ## The termination grace period of the Ingress Controller pod. - terminationGracePeriodSeconds: 30 - - ## The resources of the Ingress Controller pods. - resources: - requests: - cpu: 100m - memory: 128Mi - # limits: - # cpu: 1 - # memory: 1Gi - - - ## The tolerations of the Ingress Controller pods. - tolerations: [] - - ## The affinity of the Ingress Controller pods. - affinity: {} - - ## The topology spread constraints of the Ingress controller pods. - topologySpreadConstraints: {} - - ## The volumes of the Ingress Controller pods. - volumes: [] - # - name: extra-conf - # configMap: - # name: extra-conf - - ## The volumeMounts of the Ingress Controller pods. - volumeMounts: [] - # - name: extra-conf - # mountPath: /etc/nginx/conf.d/extra.conf - # subPath: extra.conf - - ## InitContainers for the Ingress Controller pods. - initContainers: [] - # - name: init-container - # image: busybox:1.34 - # command: ['sh', '-c', 'echo this is initial setup!'] - - ## The minimum number of seconds for which a newly created Pod should be ready without any of its containers crashing, for it to be considered available. - minReadySeconds: 0 - - ## Strategy used to replace old Pods by new ones. .spec.strategy.type can be "Recreate" or "RollingUpdate". "RollingUpdate" is the default value. - strategy: {} - - ## Extra containers for the Ingress Controller pods. - extraContainers: [] - # - name: container - # image: busybox:1.34 - # command: ['sh', '-c', 'echo this is a sidecar!'] - - ## The number of replicas of the Ingress Controller deployment. - replicaCount: 1 - - ## A class of the Ingress Controller. - - ## IngressClass resource with the name equal to the class must be deployed. Otherwise, - ## the Ingress Controller will fail to start. - ## The Ingress Controller only processes resources that belong to its class - i.e. have the "ingressClassName" field resource equal to the class. - - ## The Ingress Controller processes all the resources that do not have the "ingressClassName" field for all versions of kubernetes. - ingressClass: nginx - - ## New Ingresses without an ingressClassName field specified will be assigned the class specified in `controller.ingressClass`. - setAsDefaultIngress: false - - ## Namespace to watch for Ingress resources. By default the Ingress Controller watches all namespaces. - watchNamespace: "" - - ## Enable the custom resources. - enableCustomResources: true - - ## Enable preview policies. This parameter is deprecated. To enable OIDC Policies please use controller.enableOIDC instead. - enablePreviewPolicies: false - - ## Enable OIDC policies. - enableOIDC: false - - ## Enable TLS Passthrough on port 443. Requires controller.enableCustomResources. - enableTLSPassthrough: false - - ## Enable cert manager for Virtual Server resources. Requires controller.enableCustomResources. - enableCertManager: false - - ## Enable external DNS for Virtual Server resources. Requires controller.enableCustomResources. - enableExternalDNS: false - - globalConfiguration: - ## Creates the GlobalConfiguration custom resource. Requires controller.enableCustomResources. - create: false - - ## The spec of the GlobalConfiguration for defining the global configuration parameters of the Ingress Controller. - spec: {} - # listeners: - # - name: dns-udp - # port: 5353 - # protocol: UDP - # - name: dns-tcp - # port: 5353 - # protocol: TCP - - ## Enable custom NGINX configuration snippets in Ingress, VirtualServer, VirtualServerRoute and TransportServer resources. 
- enableSnippets: false - - ## Add a location based on the value of health-status-uri to the default server. The location responds with the 200 status code for any request. - ## Useful for external health-checking of the Ingress Controller. - healthStatus: false - - ## Sets the URI of health status location in the default server. Requires controller.healthStatus. - healthStatusURI: "/nginx-health" - - nginxStatus: - ## Enable the NGINX stub_status, or the NGINX Plus API. - enable: true - - ## Set the port where the NGINX stub_status or the NGINX Plus API is exposed. - port: 8080 - - ## Add IPv4 IP/CIDR blocks to the allow list for NGINX stub_status or the NGINX Plus API. Separate multiple IP/CIDR by commas. - allowCidrs: "127.0.0.1" - - service: - ## Creates a service to expose the Ingress Controller pods. - create: true - - ## The type of service to create for the Ingress Controller. - type: NodePort - - ## The externalTrafficPolicy of the service. The value Local preserves the client source IP. - externalTrafficPolicy: Local - - ## The annotations of the Ingress Controller service. - annotations: {} - - ## The extra labels of the service. - extraLabels: {} - - ## The static IP address for the load balancer. Requires controller.service.type set to LoadBalancer. The cloud provider must support this feature. - loadBalancerIP: "" - - ## The list of external IPs for the Ingress Controller service. - externalIPs: [] - - ## The IP ranges (CIDR) that are allowed to access the load balancer. Requires controller.service.type set to LoadBalancer. The cloud provider must support this feature. - loadBalancerSourceRanges: [] - - ## The name of the service - ## Autogenerated if not set or set to "". - # name: nginx-ingress - - ## Whether to automatically allocate NodePorts (only for LoadBalancers). - # allocateLoadBalancerNodePorts: true - - ## Dual stack preference. - ## Valid values: SingleStack, PreferDualStack, RequireDualStack - # ipFamilyPolicy: SingleStack - - ## List of IP families assigned to this service. - ## Valid values: IPv4, IPv6 - # ipFamilies: - # - IPv6 - - httpPort: - ## Enables the HTTP port for the Ingress Controller service. - enable: true - - ## The HTTP port of the Ingress Controller service. - port: 80 - - ## The custom NodePort for the HTTP port. Requires controller.service.type set to NodePort. - nodePort: "30123" - - ## The HTTP port on the POD where the Ingress Controller service is running. - targetPort: 80 - - httpsPort: - ## Enables the HTTPS port for the Ingress Controller service. - enable: true - - ## The HTTPS port of the Ingress Controller service. - port: 443 - - ## The custom NodePort for the HTTPS port. Requires controller.service.type set to NodePort. - nodePort: "30124" - - ## The HTTPS port on the POD where the Ingress Controller service is running. - targetPort: 443 - - ## A list of custom ports to expose through the Ingress Controller service. Follows the conventional Kubernetes yaml syntax for service ports. - customPorts: [] - - serviceAccount: - ## The name of the service account of the Ingress Controller pods. Used for RBAC. - ## Autogenerated if not set or set to "". - # name: nginx-ingress - - ## The name of the secret containing docker registry credentials. - ## Secret must exist in the same namespace as the helm release. - imagePullSecretName: "" - - reportIngressStatus: - ## Updates the address field in the status of Ingress resources with an external address of the Ingress Controller. 
- ## You must also specify the source of the external address either through an external service via controller.reportIngressStatus.externalService, - ## controller.reportIngressStatus.ingressLink or the external-status-address entry in the ConfigMap via controller.config.entries. - ## Note: controller.config.entries.external-status-address takes precedence over the others. - enable: true - - ## Specifies the name of the service with the type LoadBalancer through which the Ingress Controller is exposed externally. - ## The external address of the service is used when reporting the status of Ingress, VirtualServer and VirtualServerRoute resources. - ## controller.reportIngressStatus.enable must be set to true. - ## The default is autogenerated and matches the created service (see controller.service.create). - # externalService: nginx-ingress - - ## Specifies the name of the IngressLink resource, which exposes the Ingress Controller pods via a BIG-IP system. - ## The IP of the BIG-IP system is used when reporting the status of Ingress, VirtualServer and VirtualServerRoute resources. - ## controller.reportIngressStatus.enable must be set to true. - ingressLink: "" - - ## Enable Leader election to avoid multiple replicas of the controller reporting the status of Ingress resources. controller.reportIngressStatus.enable must be set to true. - enableLeaderElection: true - - ## Specifies the name of the ConfigMap, within the same namespace as the controller, used as the lock for leader election. controller.reportIngressStatus.enableLeaderElection must be set to true. - ## Autogenerated if not set or set to "". - # leaderElectionLockName: "nginx-ingress-leader-election" - - ## The annotations of the leader election configmap. - annotations: {} - - pod: - ## The annotations of the Ingress Controller pod. - annotations: {} - - ## The additional extra labels of the Ingress Controller pod. - extraLabels: {} - - ## The PriorityClass of the ingress controller pods. - priorityClassName: - - readyStatus: - ## Enables readiness endpoint "/nginx-ready". The endpoint returns a success code when NGINX has loaded all the config after startup. - enable: true - - ## Set the port where the readiness endpoint is exposed. - port: 8081 - - ## Enable collection of latency metrics for upstreams. Requires prometheus.create. - enableLatencyMetrics: false - -rbac: - ## Configures RBAC. - create: true - -prometheus: - ## Expose NGINX or NGINX Plus metrics in the Prometheus format. - create: true - - ## Configures the port to scrape the metrics. - port: 9113 - - ## Specifies the namespace/name of a Kubernetes TLS Secret which will be used to protect the Prometheus endpoint. - secret: "" - - ## Configures the HTTP scheme used. - scheme: http - -nginxServiceMesh: - ## Enables integration with NGINX Service Mesh. - ## Requires controller.nginxplus - enable: false - - ## Enables NGINX Service Mesh workload to route egress traffic through the Ingress Controller. 
- ## Requires nginxServiceMesh.enable - enableEgress: false diff --git a/roles/kubernetes_power_manager/tasks/deploy_features.yml b/roles/kubernetes_power_manager/tasks/deploy_features.yml index e23b2315..2cd04f8c 100644 --- a/roles/kubernetes_power_manager/tasks/deploy_features.yml +++ b/roles/kubernetes_power_manager/tasks/deploy_features.yml @@ -59,3 +59,25 @@ state: present src: "{{ (kubernetes_power_manager_dir, 'cstates_' + power_node + '.yaml') | path_join }}" when: inventory_hostname == groups['kube_control_plane'][0] + +# Time of Day +- name: prepare and deploy Time of Day + when: + - hostvars[power_node]['time_of_day']['enabled'] | default(false) + - inventory_hostname == groups['kube_control_plane'][0] + block: + - name: prepare required variables to deploy Time of Day + ansible.builtin.set_fact: + time_of_day: "{{ hostvars[power_node]['time_of_day'] }}" + + - name: populate Time of Day template + ansible.builtin.template: + src: time_of_day.yaml.j2 + dest: "{{ (kubernetes_power_manager_dir, 'time_of_day_' + power_node + '.yaml') | path_join }}" + force: yes + mode: preserve + + - name: apply Time of Day + kubernetes.core.k8s: + state: present + src: "{{ (kubernetes_power_manager_dir, 'time_of_day_' + power_node + '.yaml') | path_join }}" diff --git a/roles/kubernetes_power_manager/tasks/kpm_preflight.yml b/roles/kubernetes_power_manager/tasks/kpm_preflight.yml index c7bf5fcb..ae78c11c 100644 --- a/roles/kubernetes_power_manager/tasks/kpm_preflight.yml +++ b/roles/kubernetes_power_manager/tasks/kpm_preflight.yml @@ -47,7 +47,7 @@ - name: check global scaling governor ansible.builtin.assert: - that: kubernetes_power_manager.global_governor in {{ available_governors }} + that: kubernetes_power_manager.global_governor in available_governors fail_msg: - "{{ kubernetes_power_manager.global_governor }} governor is not supported" when: @@ -57,7 +57,7 @@ - name: check local scaling governor ansible.builtin.assert: - that: local_shared_profile.local_governor in {{ available_governors }} + that: local_shared_profile.local_governor in available_governors fail_msg: - "{{ local_shared_profile.local_governor }} governor is not supported" when: @@ -67,9 +67,9 @@ - name: check scaling driver ansible.builtin.assert: - that: frequency_scaling_driver is defined and frequency_scaling_driver == "acpi_cpufreq" + that: frequency_scaling_driver is defined and frequency_scaling_driver == "intel_cpufreq" fail_msg: - - "Governors {{ acpi_only_governors }} are only available with acpi_cpufreq scaling driver. + - "Governors {{ cpufreq_only_governors }} are only available with intel_cpufreq scaling driver. Please change scaling driver in host vars." 
when: - kubernetes_power_manager.global_governor is defined and kubernetes_power_manager.global_governor in ["userspace", "schedutil"] or diff --git a/roles/kubernetes_power_manager/tasks/main.yml b/roles/kubernetes_power_manager/tasks/main.yml index 98571bb0..ab7f9bdd 100644 --- a/roles/kubernetes_power_manager/tasks/main.yml +++ b/roles/kubernetes_power_manager/tasks/main.yml @@ -19,7 +19,15 @@ name: install_dependencies - name: add labels for Power Nodes - ansible.builtin.command: kubectl label nodes {{ hostvars[item]['ansible_hostname'] }} intel.power.node=true --overwrite + kubernetes.core.k8s: + state: present + definition: + apiVersion: v1 + kind: Node + metadata: + name: "{{ hostvars[item]['ansible_hostname'] }}" + labels: + intel.power.node: 'true' loop: "{{ kubernetes_power_manager.power_nodes }}" when: inventory_hostname == groups['kube_control_plane'][0] diff --git a/roles/kubernetes_power_manager/tasks/power_manager.yml b/roles/kubernetes_power_manager/tasks/power_manager.yml index 24b18ac8..e9f66788 100644 --- a/roles/kubernetes_power_manager/tasks/power_manager.yml +++ b/roles/kubernetes_power_manager/tasks/power_manager.yml @@ -124,7 +124,7 @@ state: present src: "{{ (kubernetes_power_manager_dir, 'config', 'rbac', 'rbac.yaml') | path_join }}" - # WA: go mod tidy is needed, until upstream issue is fixed. + # WA: go mod tidy is needed, until upstream issue is fixed. Checked 10. 01. 2024 - name: run go mod tidy ansible.builtin.command: "go mod tidy -v" args: diff --git a/roles/kubernetes_power_manager/templates/sample_power_pod.yaml.j2 b/roles/kubernetes_power_manager/templates/sample_power_pod.yaml.j2 index 28fd1d8a..78dc3cf3 100644 --- a/roles/kubernetes_power_manager/templates/sample_power_pod.yaml.j2 +++ b/roles/kubernetes_power_manager/templates/sample_power_pod.yaml.j2 @@ -22,4 +22,4 @@ spec: cpu: "2" power.intel.com/{{ power_profile_name }}: "2" nodeSelector: - kubernetes.io/hostname: {{ power_node }} + kubernetes.io/hostname: {{ hostvars[power_node]['ansible_hostname'] }} diff --git a/roles/kubernetes_power_manager/templates/shared_workload.yaml.j2 b/roles/kubernetes_power_manager/templates/shared_workload.yaml.j2 index 99624fd7..5538bc92 100644 --- a/roles/kubernetes_power_manager/templates/shared_workload.yaml.j2 +++ b/roles/kubernetes_power_manager/templates/shared_workload.yaml.j2 @@ -10,5 +10,5 @@ spec: # IMPORTANT: The CPUs in reservedCPUs should match the value of the reserved system CPUs in your Kubelet config file reservedCPUs: {{ reserved_cpus }} powerNodeSelector: - kubernetes.io/hostname: {{ node_name }} + kubernetes.io/hostname: {{ hostvars[node_name]['ansible_hostname'] }} powerProfile: shared-{{ shared_workload_type }} diff --git a/roles/kubernetes_power_manager/templates/time_of_day.yaml.j2 b/roles/kubernetes_power_manager/templates/time_of_day.yaml.j2 new file mode 100644 index 00000000..0128e23a --- /dev/null +++ b/roles/kubernetes_power_manager/templates/time_of_day.yaml.j2 @@ -0,0 +1,9 @@ +apiVersion: "power.intel.com/v1" +kind: TimeOfDay +metadata: + name: timeofday-sample + namespace: {{ kubernetes_power_manager_namespace }} +spec: + timeZone: {{ time_of_day.time_zone }} + schedule: {{ time_of_day.schedule }} + reservedCPUs: {{ time_of_day.reserved_cpus }} diff --git a/roles/kubernetes_power_manager/templates/uncore_frequency.yaml.j2 b/roles/kubernetes_power_manager/templates/uncore_frequency.yaml.j2 index 0bb1bff5..10ccf301 100644 --- a/roles/kubernetes_power_manager/templates/uncore_frequency.yaml.j2 +++ 
b/roles/kubernetes_power_manager/templates/uncore_frequency.yaml.j2 @@ -1,4 +1,4 @@ -apiVersion: power.intel.com/v1 +apiVersion: "power.intel.com/v1" kind: Uncore metadata: name: {{ power_node }} diff --git a/roles/kubernetes_power_manager/vars/main.yml b/roles/kubernetes_power_manager/vars/main.yml index 74a34cbf..3774a7e9 100644 --- a/roles/kubernetes_power_manager/vars/main.yml +++ b/roles/kubernetes_power_manager/vars/main.yml @@ -31,5 +31,5 @@ combined_profiles: [] multiplier: '{{ [kubernetes_power_manager.power_nodes | length / 20, 1.0] | min }}' # 20+ nodes will double the max basic value common_governors: ["performance", "powersave"] -acpi_only_governors: ["userspace", "schedutil"] -available_governors: "{{ common_governors + acpi_only_governors }}" +cpufreq_only_governors: ["userspace", "schedutil"] +available_governors: "{{ common_governors + cpufreq_only_governors }}" diff --git a/roles/kubespray_patch/files/kubernetes_core.patch b/roles/kubespray_patch/files/kubernetes_core.patch new file mode 100644 index 00000000..e136e897 --- /dev/null +++ b/roles/kubespray_patch/files/kubernetes_core.patch @@ -0,0 +1,13 @@ +diff --git a/plugins/module_utils/client/discovery.py b/plugins/module_utils/client/discovery.py +index 848d4bd..aaff2e6 100644 +--- a/plugins/module_utils/client/discovery.py ++++ b/plugins/module_utils/client/discovery.py +@@ -113,7 +113,7 @@ class Discoverer(kubernetes.dynamic.discovery.Discoverer): + filter(lambda resource: "/" in resource["name"], resources_response) + ) + for subresource in subresources_raw: +- resource, name = subresource["name"].split("/") ++ resource, name = subresource["name"].split("/")[:2] + subresources[resource][name] = subresource + + for resource in resources_raw: diff --git a/roles/kubespray_patch/files/kubespray_delay_wait.patch b/roles/kubespray_patch/files/kubespray_delay_wait.patch index 33986ee1..6f96a4e2 100644 --- a/roles/kubespray_patch/files/kubespray_delay_wait.patch +++ b/roles/kubespray_patch/files/kubespray_delay_wait.patch @@ -1,18 +1,18 @@ diff --git a/roles/kubernetes/preinstall/handlers/main.yml b/roles/kubernetes/preinstall/handlers/main.yml -index 631ea743e..333930b3e 100644 +index 35140ab42..9d60bc1b8 100644 --- a/roles/kubernetes/preinstall/handlers/main.yml +++ b/roles/kubernetes/preinstall/handlers/main.yml -@@ -9,6 +9,7 @@ +@@ -21,6 +21,7 @@ - Preinstall | restart kube-controller-manager crio/containerd - Preinstall | restart kube-apiserver docker - Preinstall | restart kube-apiserver crio/containerd + - Preinstall | delay wait for the apiserver to be running - - Preinstall | wait for the apiserver to be running - when: not ansible_os_family in ["Flatcar", "Flatcar Container Linux by Kinvolk"] and not is_fedora_coreos - -@@ -107,6 +108,17 @@ - - resolvconf_mode == 'host_resolvconf' + when: not dns_early | bool + listen: + - Preinstall | propagate resolvconf to k8s components +@@ -105,6 +106,18 @@ - kube_apiserver_set.stat.exists + listen: Preinstall | propagate resolvconf to k8s components +# Ensure apiserver is already restarting before wait is started +- name: Preinstall | delay wait for the apiserver to be running @@ -24,6 +24,7 @@ index 631ea743e..333930b3e 100644 + - dns_mode != 'none' + - resolvconf_mode == 'host_resolvconf' + - kube_apiserver_set.stat.exists ++ listen: Preinstall | propagate resolvconf to k8s components + # When running this as the last phase ensure we wait for kube-apiserver to come up - name: Preinstall | wait for the apiserver to be running diff --git 
a/roles/kubespray_patch/files/kubespray_unsafe_template.patch b/roles/kubespray_patch/files/kubespray_unsafe_template.patch new file mode 100644 index 00000000..dafa024f --- /dev/null +++ b/roles/kubespray_patch/files/kubespray_unsafe_template.patch @@ -0,0 +1,13 @@ +diff --git a/roles/kubernetes/preinstall/tasks/0040-verify-settings.yml b/roles/kubernetes/preinstall/tasks/0040-verify-settings.yml +index 178becf3e..83193ccaa 100644 +--- a/roles/kubernetes/preinstall/tasks/0040-verify-settings.yml ++++ b/roles/kubernetes/preinstall/tasks/0040-verify-settings.yml +@@ -1,7 +1,7 @@ + --- + - name: Stop if either kube_control_plane or kube_node group is empty + assert: +- that: "groups.get('{{ item }}')" ++ that: groups.get(item) + with_items: + - kube_control_plane + - kube_node diff --git a/roles/kubespray_patch/tasks/main.yml b/roles/kubespray_patch/tasks/main.yml index a2aab8cc..2529e1e9 100644 --- a/roles/kubespray_patch/tasks/main.yml +++ b/roles/kubespray_patch/tasks/main.yml @@ -32,9 +32,26 @@ src: "files/kubespray_delay_wait.patch" dest: "{{ kubespray_dir }}/roles/kubernetes/preinstall/handlers/main.yml" +- name: apply patch to kubernetes.core module + ansible.posix.patch: + src: "files/kubernetes_core.patch" + dest: "{{ kubernetes_module_dir }}/core/plugins/module_utils/client/discovery.py" + +# WA - to be removed at next kubespray bump! +- name: apply patch to fix unsafe templating + ansible.posix.patch: + src: "files/kubespray_unsafe_template.patch" + dest: "{{ kubespray_dir }}/roles/kubernetes/preinstall/tasks/0040-verify-settings.yml" + - name: Load patch checksum ansible.builtin.include_tasks: load_checksum.yml +- name: Write checksum to kubernetes directory + ansible.builtin.copy: + dest: "{{ kube_patch_checksum_file }}" + content: "{{ patch_checksum }}" + mode: preserve + - name: Write checksum to kubespray directory ansible.builtin.copy: dest: "{{ patch_checksum_file }}" diff --git a/roles/kubespray_patch/tasks/preflight_checksum.yml b/roles/kubespray_patch/tasks/preflight_checksum.yml index a73f939e..52665f97 100644 --- a/roles/kubespray_patch/tasks/preflight_checksum.yml +++ b/roles/kubespray_patch/tasks/preflight_checksum.yml @@ -19,6 +19,7 @@ - name: Load checksum from applied patch set_fact: applied_patch_checksum: "{{ lookup('file', patch_checksum_file, errors='ignore') }}" + applied_kube_patch_checksum: "{{ lookup('file', kube_patch_checksum_file, errors='ignore') }}" - name: Check if checksum of applied patch exists ansible.builtin.assert: @@ -34,3 +35,18 @@ fail_msg: |- Code of kubespray patch checksum is different from currently applied patch checksum. Please re-install the kubespray module and apply new kubespray patch." + +- name: Check if checksum of applied kubernetes module patch exists + ansible.builtin.assert: + that: + - applied_kube_patch_checksum is defined + - applied_kube_patch_checksum + fail_msg: |- + Patch for the kubernetes module is not applied. Please apply the patch before running playbooks. + +- name: Compare kubernetes module patch checksums + ansible.builtin.assert: + that: patch_checksum == applied_kube_patch_checksum + fail_msg: |- + Checksum of the kubernetes module patch is different from the currently applied patch checksum. + Please re-install the kubernetes module and apply the new patch.
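For orientation, the checksum gate above works by writing a marker file (ra_patch_checksum) into each patched tree when the patches are applied, then asserting on later runs that the marker still matches the checksum of the shipped patch set; a re-installed collection loses its marker and the asserts fail fast. A minimal sketch of that guard follows, assuming a sha256 over the patch file; the repository's real computation lives in load_checksum.yml, which this diff does not show, so the paths and mechanics here are illustrative only:

# Illustrative sketch only - gate playbook runs on a patch-set checksum.
- name: Compute checksum of the shipped patch file (assumed sha256 mechanics)
  ansible.builtin.set_fact:
    patch_checksum: "{{ lookup('ansible.builtin.file', role_path ~ '/files/kubernetes_core.patch') | hash('sha256') }}"

- name: Load the marker written by a previous apply, if any
  ansible.builtin.set_fact:
    applied_kube_patch_checksum: "{{ lookup('ansible.builtin.file', kube_patch_checksum_file, errors='ignore') }}"

- name: Refuse to continue when the collection was re-installed without re-patching
  ansible.builtin.assert:
    that: patch_checksum == applied_kube_patch_checksum
    fail_msg: Re-apply the kubernetes module patch before running playbooks.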
diff --git a/roles/kubespray_patch/vars/main.yml b/roles/kubespray_patch/vars/main.yml index 3da59f80..7b885e7e 100644 --- a/roles/kubespray_patch/vars/main.yml +++ b/roles/kubespray_patch/vars/main.yml @@ -15,3 +15,7 @@ ## kubespray_dir: "{{ (role_path, '../..', 'collections/ansible_collections/kubernetes_sigs/kubespray') | path_join }}" patch_checksum_file: "{{ (kubespray_dir, 'ra_patch_checksum') | path_join }}" + +# To fix BUG https://github.com/ansible-collections/kubernetes.core/issues/659 +kubernetes_module_dir: "{{ (role_path, '../..', 'collections/ansible_collections/kubernetes') | path_join }}" +kube_patch_checksum_file: "{{ (kubernetes_module_dir, 'ra_patch_checksum') | path_join }}" diff --git a/roles/kubespray_target_setup/tasks/main.yml b/roles/kubespray_target_setup/tasks/main.yml index 37636454..a2b621e6 100644 --- a/roles/kubespray_target_setup/tasks/main.yml +++ b/roles/kubespray_target_setup/tasks/main.yml @@ -66,4 +66,4 @@ systemd: name: systemd-networkd.service state: restarted - when: kube_network_plugin in ["cilium"] and ansible_distribution == "Ubuntu" and ansible_distribution_version == "22.04" + when: kube_network_plugin in ["cilium"] and ansible_distribution == "Ubuntu" and ansible_distribution_version is version('22.04', '==') diff --git a/roles/kubevirt_install/defaults/main.yml b/roles/kubevirt_install/defaults/main.yml new file mode 100644 index 00000000..bc0dfdb4 --- /dev/null +++ b/roles/kubevirt_install/defaults/main.yml @@ -0,0 +1,23 @@ +## +## Copyright (c) 2020-2023 Intel Corporation. +## +## Licensed under the Apache License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. +## You may obtain a copy of the License at +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +## See the License for the specific language governing permissions and +## limitations under the License. +## +kubevirt_version: v1.1.0 +kubevirt_base_url: https://github.com/kubevirt/kubevirt/releases + +kubevirt_operator_url: "{{ kubevirt_base_url }}/download/{{ kubevirt_version }}/kubevirt-operator.yaml" +kubevirt_cr_url: "{{ kubevirt_base_url }}/download/{{ kubevirt_version }}/kubevirt-cr.yaml" +kubevirt_namespace: kubevirt + +kubevirt_virtctl_url: "{{ kubevirt_base_url }}/download/{{ kubevirt_version }}/virtctl-{{ kubevirt_version }}-linux-amd64" diff --git a/roles/kubevirt_install/tasks/cleanup.yml b/roles/kubevirt_install/tasks/cleanup.yml new file mode 100644 index 00000000..7433bfb3 --- /dev/null +++ b/roles/kubevirt_install/tasks/cleanup.yml @@ -0,0 +1,38 @@ +## +## Copyright (c) 2020-2023 Intel Corporation. +## +## Licensed under the Apache License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. +## You may obtain a copy of the License at +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +## See the License for the specific language governing permissions and +## limitations under the License. 
+## +- name: KubeVirt | remove resources + kubernetes.core.k8s: + name: "{{ item.name }}" + kind: "{{ item.kind }}" + namespace: "{{ kubevirt_namespace }}" + state: absent + wait: true + failed_when: false # TODO rework common cleanup procedure + loop: + - { kind: "KubeVirt", name: "kubevirt"} + - { kind: "MutatingWebhookConfiguration", name: "virt-api-mutator"} + - { kind: "ValidatingWebhookConfiguration", name: "virt-operator-validator"} + - { kind: "ValidatingWebhookConfiguration", name: "virt-api-validator"} + when: inventory_hostname == groups['kube_control_plane'][0] + +- name: KubeVirt | remove operator + kubernetes.core.k8s: + src: "{{ kubevirt_operator_url }}" + namespace: "{{ kubevirt_namespace }}" + state: absent + wait: true + failed_when: false # TODO rework common cleanup procedure + when: inventory_hostname == groups['kube_control_plane'][0] diff --git a/roles/kubevirt_install/tasks/main.yml b/roles/kubevirt_install/tasks/main.yml new file mode 100644 index 00000000..4a6306c5 --- /dev/null +++ b/roles/kubevirt_install/tasks/main.yml @@ -0,0 +1,46 @@ +## +## Copyright (c) 2020-2023 Intel Corporation. +## +## Licensed under the Apache License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. +## You may obtain a copy of the License at +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +## See the License for the specific language governing permissions and +## limitations under the License. +## +- name: KubeVirt | deploy operator + kubernetes.core.k8s: + src: "{{ kubevirt_operator_url }}" + namespace: "{{ kubevirt_namespace }}" + state: present + +- name: KubeVirt | deploy custom resources + kubernetes.core.k8s: + src: "{{ kubevirt_cr_url }}" + namespace: "{{ kubevirt_namespace }}" + state: present + +- name: KubeVirt | wait until all components are up + kubernetes.core.k8s_info: + namespace: "{{ kubevirt_namespace }}" + kind: KubeVirt + name: kubevirt + wait: true + wait_condition: + type: "Available" + status: "True" + wait_timeout: "240" + +# Installation of the virt plugin via krew can be done manually, but we have not managed to automate it via Ansible +- name: KubeVirt | install virtctl binary to manage VMs + become: true + ansible.builtin.get_url: + url: "{{ kubevirt_virtctl_url }}" + dest: /usr/local/bin/virtctl + mode: '0755' + force: true diff --git a/roles/kubevirt_install/tasks/preflight.yml b/roles/kubevirt_install/tasks/preflight.yml new file mode 100644 index 00000000..a29beff9 --- /dev/null +++ b/roles/kubevirt_install/tasks/preflight.yml @@ -0,0 +1,42 @@ +## +## Copyright (c) 2020-2023 Intel Corporation. +## +## Licensed under the Apache License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. +## You may obtain a copy of the License at +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +## See the License for the specific language governing permissions and +## limitations under the License.
+## +- name: Preflight - KubeVirt | Check container runtime + run_once: true + ansible.builtin.assert: + that: container_runtime in ['containerd', 'crio'] + fail_msg: KubeVirt deployment is supported only with the containerd or crio container runtimes. + +- name: Preflight - KubeVirt | Check HW virtualization enabled + block: + - name: Check Intel Virtualization Technology + ansible.builtin.shell: "set -o pipefail && lscpu | grep 'Virtualization:'" + args: + executable: /bin/bash + register: virtualization_tech + changed_when: false + failed_when: false + + - ansible.builtin.debug: msg="{{ virtualization_tech.stdout }}" + when: + - virtualization_tech.stdout | length > 0 + + - name: Fail if Intel Virtualization Technology is disabled + ansible.builtin.fail: + msg: | + "Warning: Intel Virtualization Technology is DISABLED on target." + "Please check BIOS under 'Advanced > Processor Configuration' and enable it if necessary." + when: + - "'VT-x' not in virtualization_tech.stdout" diff --git a/roles/linkerd_service_mesh/defaults/main.yml b/roles/linkerd_service_mesh/defaults/main.yml index 1072f402..b908536b 100644 --- a/roles/linkerd_service_mesh/defaults/main.yml +++ b/roles/linkerd_service_mesh/defaults/main.yml @@ -17,13 +17,12 @@ # defaults file for linkerd-cli linkerd_cli_arch: "amd64" linkerd_release: "stable" -linkerd_version: "2.13.6" +linkerd_version: "2.14.6" linkerd_cli_version: "{{ linkerd_version }}" linkerd_cli_uri: "https://github.com/linkerd/linkerd2/releases/download/{{ linkerd_release }}-{{ linkerd_cli_version }}/\ linkerd2-cli-{{ linkerd_release }}-{{ linkerd_cli_version }}-linux-{{ linkerd_cli_arch }}" linkerd_cli_bin: "/usr/local/bin/linkerd" -kubectl_cli_bin: "/usr/local/bin/kubectl" linkerd_namespace: "linkerd" linkerd_helm_repo: "https://helm.linkerd.io/{{ linkerd_release }}" linkerd_helm_values_file: "{{ (project_root_dir, 'linkerd', 'linkerd-control-plane-values.yml') | path_join }}" diff --git a/roles/linkerd_service_mesh/tasks/main.yml b/roles/linkerd_service_mesh/tasks/main.yml index 8b877f23..955bb6f1 100644 --- a/roles/linkerd_service_mesh/tasks/main.yml +++ b/roles/linkerd_service_mesh/tasks/main.yml @@ -54,7 +54,9 @@ wait: true - name: Delete LinkerD Heartbeat CronJob if http_proxy is enabled - ansible.builtin.shell: "set -o pipefail && {{ kubectl_cli_bin }} delete cronjob linkerd-heartbeat -n linkerd" - args: - executable: /bin/bash + kubernetes.core.k8s: + kind: CronJob + name: linkerd-heartbeat + namespace: "{{ linkerd_namespace }}" + state: absent when: http_proxy is defined or https_proxy is defined diff --git a/roles/minio_install/tasks/build_local_awscli_image.yml b/roles/minio_install/tasks/build_local_awscli_image.yml index 55a9437c..3739165f 100644 --- a/roles/minio_install/tasks/build_local_awscli_image.yml +++ b/roles/minio_install/tasks/build_local_awscli_image.yml @@ -55,41 +55,31 @@ dest: "{{ (minio_operator_helm_local_dir, 'tenant', 'temp', 'awscli', 'Dockerfile') | path_join }}" mode: 0644 -- debug: +- ansible.builtin.debug: msg: Container Runtime - "{{ container_runtime }}" # docker is used as container runtime: -- name: prepare containers images - block: - - name: build local AWS custom image - command: >- - docker build -f Dockerfile -t {{ registry_local_address }}/{{ aws_local_build_name }}:{{ aws_image_tag }} ./ - args: - chdir: "{{ aws_local_build_dir }}" - changed_when: true - when: aws_build_image_locally - - - name: push the local AWS custom images to local registry - command: >- - docker push {{ registry_local_address }}/{{ aws_local_build_name }}:{{ aws_image_tag }} -
when: aws_build_image_locally +- name: Build and push AWS custom image - docker + community.docker.docker_image: + build: + path: "{{ aws_local_build_dir }}" + dockerfile: Dockerfile + name: "{{ registry_local_address }}/{{ aws_local_build_name }}" + tag: "{{ aws_image_tag }}" + push: true + source: build when: + - aws_build_image_locally - container_runtime == "docker" # containerd/cri-o is used as container runtime: -- name: prepare containers images - block: - - name: build local AWS custom image - command: >- - podman build -f Dockerfile -t {{ registry_local_address }}/{{ aws_local_build_name }}:{{ aws_image_tag }} ./ - args: - chdir: "{{ aws_local_build_dir }}" - changed_when: true - when: aws_build_image_locally - - - name: push the local AWS custom images to local registry - command: >- - podman push {{ registry_local_address }}/{{ aws_local_build_name }}:{{ aws_image_tag }} - when: aws_build_image_locally +- name: Build and push AWS custom image - podman + containers.podman.podman_image: + name: "{{ registry_local_address }}/{{ aws_local_build_name }}" + tag: "{{ aws_image_tag }}" + path: "{{ aws_local_build_dir }}" + push: true + state: build when: + - aws_build_image_locally - container_runtime is in ['containerd', 'crio'] diff --git a/roles/minio_install/tasks/build_local_minio_image.yml b/roles/minio_install/tasks/build_local_minio_image.yml index 1f9ecb6e..ef7e9863 100644 --- a/roles/minio_install/tasks/build_local_minio_image.yml +++ b/roles/minio_install/tasks/build_local_minio_image.yml @@ -50,48 +50,33 @@ unzip \ && rm -rf /var/lib/apt/lists/* -# docker is used as container runtime: -- name: prepare containers images - block: - - name: compile local MinIO - make: - chdir: "{{ minio_local_build_dir }}" - when: minio_build_image_locally - - - name: build local MinIO custom image - command: >- - docker build -f Dockerfile -t {{ registry_local_address }}/{{ minio_local_build_name }}:{{ minio_git_tag }} ./ - args: - chdir: "{{ minio_local_build_dir }}" - changed_when: true - when: minio_build_image_locally +- name: Compile local MinIO + community.general.make: + chdir: "{{ minio_local_build_dir }}" + when: minio_build_image_locally - - name: push the local MinIO custom images to local registry - command: >- - docker push {{ registry_local_address }}/{{ minio_local_build_name }}:{{ minio_git_tag }} - when: minio_build_image_locally +# docker is used as container runtime: +- name: Build and push MinIO custom image - docker + community.docker.docker_image: + build: + path: "{{ minio_local_build_dir }}" + dockerfile: Dockerfile + name: "{{ registry_local_address }}/{{ minio_local_build_name }}" + tag: "{{ minio_git_tag }}" + push: true + source: build when: + - minio_build_image_locally - container_runtime == "docker" # containerd/cri-o is used as container runtime: -- name: prepare containers images - block: - - name: compile local MinIO - make: - chdir: "{{ minio_local_build_dir }}" - when: minio_build_image_locally - - - name: build local MinIO custom image - command: >- - podman build -f Dockerfile -t {{ registry_local_address }}/{{ minio_local_build_name }}:{{ minio_git_tag }} ./ - args: - chdir: "{{ minio_local_build_dir }}" - changed_when: true - when: minio_build_image_locally - - - name: push the local MinIO custom images to local registry - command: >- - podman push {{ registry_local_address }}/{{ minio_local_build_name }}:{{ minio_git_tag }} - when: minio_build_image_locally +- name: Build and push MinIO custom image - podman + containers.podman.podman_image: + name: "{{ 
registry_local_address }}/{{ minio_local_build_name }}" + tag: "{{ minio_git_tag }}" + path: "{{ minio_local_build_dir }}" + push: true + state: build when: + - minio_build_image_locally - container_runtime is in ['containerd', 'crio'] diff --git a/roles/minio_install/tasks/build_local_postgress_image.yml b/roles/minio_install/tasks/build_local_postgress_image.yml index 1635c296..12223a57 100644 --- a/roles/minio_install/tasks/build_local_postgress_image.yml +++ b/roles/minio_install/tasks/build_local_postgress_image.yml @@ -47,8 +47,10 @@ changed_when: true - name: output huge_pages setting of the container - shell: >- - podman exec -it {{ minio_log_postgres_name }} cat /usr/share/postgresql/postgresql.conf.sample | grep huge_pages + ansible.builtin.shell: + cmd: >- + set -o pipefail && podman exec -it {{ minio_log_postgres_name }} cat /usr/share/postgresql/postgresql.conf.sample | grep huge_pages + executable: /bin/bash changed_when: true - name: commit the container @@ -101,8 +103,10 @@ changed_when: true - name: output huge_pages setting of the container - shell: >- - docker exec -it {{ minio_log_postgres_name }} cat /usr/share/postgresql/postgresql.conf.sample | grep huge_pages + ansible.builtin.shell: + cmd: >- + set -o pipefail && docker exec -it {{ minio_log_postgres_name }} cat /usr/share/postgresql/postgresql.conf.sample | grep huge_pages + executable: /bin/bash changed_when: true - name: commit the container diff --git a/roles/minio_install/tasks/build_minio_variables.yml b/roles/minio_install/tasks/build_minio_variables.yml index 2a374316..ea50f5ee 100644 --- a/roles/minio_install/tasks/build_minio_variables.yml +++ b/roles/minio_install/tasks/build_minio_variables.yml @@ -37,7 +37,7 @@ set_fact: temp_minio_interfaces: "{{ temp_minio_interfaces | default([]) + [item | combine({'nodename': minio_hostname})] }}" vars: - minio_hostname: "{{ inventory_hostname }}" + minio_hostname: "{{ hostvars[inventory_hostname]['ansible_hostname'] }}" loop: "{{ hostvars[inventory_hostname]['dataplane_interfaces'] }}" when: - inventory_hostname in groups['kube_node'] diff --git a/roles/minio_install/tasks/cleanup_minio_main.yml b/roles/minio_install/tasks/cleanup_minio_main.yml index c50171e4..abefd362 100644 --- a/roles/minio_install/tasks/cleanup_minio_main.yml +++ b/roles/minio_install/tasks/cleanup_minio_main.yml @@ -61,18 +61,5 @@ name: minio_install tasks_from: cleanup_minio_whereabouts_helmchart - - name: load MinIO tenant ingress controller variables - include_vars: "{{ item }}" - with_first_found: - - files: - - "main.yml" - paths: - - "{{ (role_path, '..', 'kubernetes_ingress_install', 'defaults') | path_join }}" - - - name: cleanup MinIO tenant ingress controller helmchart - include_role: - name: kubernetes_ingress_install - tasks_from: cleanup_kubernetes_ingress tags: minio diff --git a/roles/minio_install/tasks/main.yml b/roles/minio_install/tasks/main.yml index 5ce2d9dd..b1c8cc04 100644 --- a/roles/minio_install/tasks/main.yml +++ b/roles/minio_install/tasks/main.yml @@ -47,12 +47,6 @@ include_role: name: whereabouts_install -- name: install dependencies - kubernetes_ingress - include_role: - name: kubernetes_ingress_install - when: - minio_ingress_enabled | default(false) - - name: clone MinIO operator/console/tenant include_tasks: clone_minio_operator.yml when: diff --git a/roles/minio_install/tasks/preflight_minio_config.yml b/roles/minio_install/tasks/preflight_minio_config.yml index 91e212be..511a0e11 100644 --- a/roles/minio_install/tasks/preflight_minio_config.yml +++
b/roles/minio_install/tasks/preflight_minio_config.yml @@ -18,7 +18,7 @@ block: - name: check MinIO minimum number of nodes assert: - that: "{{ minio_tenant_servers | int }} <= {{ groups['kube_node'] | length | int }}" + that: (minio_tenant_servers | int) <= (groups['kube_node'] | length | int) msg: | "Incorrect configuration." "The number of MinIO tenant servers '{{ minio_tenant_servers | int }}' defined in group vars must be" @@ -26,7 +26,7 @@ - name: make sure the MinIO tenant volumes per server >= the MiniO PV list assert: - that: "{{ persistent_volumes | length | int }} >= {{ minio_tenant_volumes_per_server | int }}" + that: (persistent_volumes | length | int) >= (minio_tenant_volumes_per_server | int) msg: - "Incorrect configuration." - "The number of MinIO Persistent Volumes (PVs) '{{ persistent_volumes | length | int }}' defined in the host vars must be" diff --git a/roles/minio_install/templates/minio_tenant_localpersistentvolumes.yml.j2 b/roles/minio_install/templates/minio_tenant_localpersistentvolumes.yml.j2 index 9e7f14c1..d193e51b 100644 --- a/roles/minio_install/templates/minio_tenant_localpersistentvolumes.yml.j2 +++ b/roles/minio_install/templates/minio_tenant_localpersistentvolumes.yml.j2 @@ -21,5 +21,5 @@ spec: - key: kubernetes.io/hostname operator: In values: - - {{ item }} + - {{ hostvars[item]['ansible_hostname'] }} {% endfor %} diff --git a/roles/multus_service/defaults/main.yml b/roles/multus_service/defaults/main.yml index 416d1658..e2907b8a 100755 --- a/roles/multus_service/defaults/main.yml +++ b/roles/multus_service/defaults/main.yml @@ -18,7 +18,7 @@ multus_service_namespace: "kube-system" # Multus Service namespace multus_service_release_name: "multus-service" # Multus Service Helm Charts release name multus_service_image: ghcr.io/k8snetworkplumbingwg/multus-service -multus_service_digest: sha256:f53a6fcf3f728bec8fc6ceb1a6e5ad0ee0cc912ceb3c6610a3c8a468cb2736b9 +multus_service_digest: sha256:3d825327b9851d6045448d7db5323689d1eb4ae813eeac26ed8fcf5ee8d194fb # Placeholder to upstream where deployment k8s template is taken from # multus_service_git_url: "https://github.com/k8snetworkplumbingwg/multus-service.git" diff --git a/roles/net_attach_defs_create/templates/userspace-vpp.j2 b/roles/net_attach_defs_create/templates/userspace-vpp.j2 index 2910b6fe..cf23ab58 100644 --- a/roles/net_attach_defs_create/templates/userspace-vpp.j2 +++ b/roles/net_attach_defs_create/templates/userspace-vpp.j2 @@ -25,7 +25,7 @@ spec: "mode": "ethernet" }, "bridge": { - "bridgeId": 0 + "bridgeName": "4" } }, "container": { diff --git a/roles/nfd_install/charts/node-feature-discovery/.helmignore b/roles/nfd_install/charts/node-feature-discovery/.helmignore deleted file mode 100644 index 0e8a0eb3..00000000 --- a/roles/nfd_install/charts/node-feature-discovery/.helmignore +++ /dev/null @@ -1,23 +0,0 @@ -# Patterns to ignore when building packages. -# This supports shell glob matching, relative path matching, and -# negation (prefixed with !). Only one pattern per line. 
-.DS_Store -# Common VCS dirs -.git/ -.gitignore -.bzr/ -.bzrignore -.hg/ -.hgignore -.svn/ -# Common backup files -*.swp -*.bak -*.tmp -*.orig -*~ -# Various IDEs -.project -.idea/ -*.tmproj -.vscode/ diff --git a/roles/nfd_install/charts/node-feature-discovery/Chart.yaml b/roles/nfd_install/charts/node-feature-discovery/Chart.yaml deleted file mode 100644 index f4808c68..00000000 --- a/roles/nfd_install/charts/node-feature-discovery/Chart.yaml +++ /dev/null @@ -1,30 +0,0 @@ -## -## Copyright (c) 2020-2023 Intel Corporation. -## -## Licensed under the Apache License, Version 2.0 (the "License"); -## you may not use this file except in compliance with the License. -## You may obtain a copy of the License at -## -## http://www.apache.org/licenses/LICENSE-2.0 -## -## Unless required by applicable law or agreed to in writing, software -## distributed under the License is distributed on an "AS IS" BASIS, -## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -## See the License for the specific language governing permissions and -## limitations under the License. -## -apiVersion: v2 -appVersion: "version_placeholder" -description: | - Detects hardware features available on each node in a Kubernetes cluster, and advertises - those features using node labels. -name: node-feature-discovery -sources: - - https://github.com/kubernetes-sigs/node-feature-discovery -home: https://github.com/kubernetes-sigs/node-feature-discovery -keywords: - - feature-discovery - - feature-detection - - node-labels -type: application -version: 0.2.1 diff --git a/roles/nfd_install/charts/node-feature-discovery/README.md b/roles/nfd_install/charts/node-feature-discovery/README.md deleted file mode 100644 index 33be2e37..00000000 --- a/roles/nfd_install/charts/node-feature-discovery/README.md +++ /dev/null @@ -1,25 +0,0 @@ - -# Node Feature Discovery - -Node Feature Discovery (NFD) is a Kubernetes add-on for detecting hardware -features and system configuration. Detected features are advertised as node -labels. NFD provides flexible configuration and extension points for a wide -range of vendor and application specific node labeling needs. - -See -[NFD documentation](https://kubernetes-sigs.github.io/node-feature-discovery/v0.12/get-started/deployment-and-usage.html#deployment-with-helm) -for deployment instructions. diff --git a/roles/nfd_install/charts/node-feature-discovery/crds/nfd-api-crds.yaml b/roles/nfd_install/charts/node-feature-discovery/crds/nfd-api-crds.yaml deleted file mode 100644 index 99f6d431..00000000 --- a/roles/nfd_install/charts/node-feature-discovery/crds/nfd-api-crds.yaml +++ /dev/null @@ -1,378 +0,0 @@ -## -## Copyright (c) 2020-2023 Intel Corporation. -## -## Licensed under the Apache License, Version 2.0 (the "License"); -## you may not use this file except in compliance with the License. -## You may obtain a copy of the License at -## -## http://www.apache.org/licenses/LICENSE-2.0 -## -## Unless required by applicable law or agreed to in writing, software -## distributed under the License is distributed on an "AS IS" BASIS, -## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -## See the License for the specific language governing permissions and -## limitations under the License. 
-## ---- -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - annotations: - controller-gen.kubebuilder.io/version: v0.11.3 - creationTimestamp: null - name: nodefeatures.nfd.k8s-sigs.io -spec: - group: nfd.k8s-sigs.io - names: - kind: NodeFeature - listKind: NodeFeatureList - plural: nodefeatures - singular: nodefeature - scope: Namespaced - versions: - - name: v1alpha1 - schema: - openAPIV3Schema: - description: NodeFeature resource holds the features discovered for one node - in the cluster. - properties: - apiVersion: - description: 'APIVersion defines the versioned schema of this representation - of an object. Servers should convert recognized schemas to the latest - internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' - type: string - kind: - description: 'Kind is a string value representing the REST resource this - object represents. Servers may infer this from the endpoint the client - submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' - type: string - metadata: - type: object - spec: - description: NodeFeatureSpec describes a NodeFeature object. - properties: - features: - description: Features is the full "raw" features data that has been - discovered. - properties: - attributes: - additionalProperties: - description: AttributeFeatureSet is a set of features having - string value. - properties: - elements: - additionalProperties: - type: string - type: object - required: - - elements - type: object - description: Attributes contains all the attribute-type features - of the node. - type: object - flags: - additionalProperties: - description: FlagFeatureSet is a set of simple features only - containing names without values. - properties: - elements: - additionalProperties: - description: Nil is a dummy empty struct for protobuf - compatibility - type: object - type: object - required: - - elements - type: object - description: Flags contains all the flag-type features of the - node. - type: object - instances: - additionalProperties: - description: InstanceFeatureSet is a set of features each of - which is an instance having multiple attributes. - properties: - elements: - items: - description: InstanceFeature represents one instance of - a complex features, e.g. a device. - properties: - attributes: - additionalProperties: - type: string - type: object - required: - - attributes - type: object - type: array - required: - - elements - type: object - description: Instances contains all the instance-type features - of the node. - type: object - type: object - labels: - additionalProperties: - type: string - description: Labels is the set of node labels that are requested to - be created. 
- type: object - type: object - required: - - spec - type: object - served: true - storage: true ---- -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - annotations: - controller-gen.kubebuilder.io/version: v0.11.3 - creationTimestamp: null - name: nodefeaturerules.nfd.k8s-sigs.io -spec: - group: nfd.k8s-sigs.io - names: - kind: NodeFeatureRule - listKind: NodeFeatureRuleList - plural: nodefeaturerules - shortNames: - - nfr - singular: nodefeaturerule - scope: Cluster - versions: - - name: v1alpha1 - schema: - openAPIV3Schema: - description: NodeFeatureRule resource specifies a configuration for feature-based - customization of node objects, such as node labeling. - properties: - apiVersion: - description: 'APIVersion defines the versioned schema of this representation - of an object. Servers should convert recognized schemas to the latest - internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' - type: string - kind: - description: 'Kind is a string value representing the REST resource this - object represents. Servers may infer this from the endpoint the client - submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' - type: string - metadata: - type: object - spec: - description: NodeFeatureRuleSpec describes a NodeFeatureRule. - properties: - rules: - description: Rules is a list of node customization rules. - items: - description: Rule defines a rule for node customization such as - labeling. - properties: - extendedResources: - additionalProperties: - type: string - description: ExtendedResources to create if the rule matches. - type: object - labels: - additionalProperties: - type: string - description: Labels to create if the rule matches. - type: object - labelsTemplate: - description: LabelsTemplate specifies a template to expand for - dynamically generating multiple labels. Data (after template - expansion) must be keys with an optional value ([=]) - separated by newlines. - type: string - matchAny: - description: MatchAny specifies a list of matchers one of which - must match. - items: - description: MatchAnyElem specifies one sub-matcher of MatchAny. - properties: - matchFeatures: - description: MatchFeatures specifies a set of matcher - terms all of which must match. - items: - description: FeatureMatcherTerm defines requirements - against one feature set. All requirements (specified - as MatchExpressions) are evaluated against each element - in the feature set. - properties: - feature: - type: string - matchExpressions: - additionalProperties: - description: "MatchExpression specifies an expression - to evaluate against a set of input values. It - contains an operator that is applied when matching - the input and an array of values that the operator - evaluates the input against. \n NB: CreateMatchExpression - or MustCreateMatchExpression() should be used - for creating new instances. \n NB: Validate() - must be called if Op or Value fields are modified - or if a new instance is created from scratch - without using the helper functions." - properties: - op: - description: Op is the operator to be applied. - enum: - - In - - NotIn - - InRegexp - - Exists - - DoesNotExist - - Gt - - Lt - - GtLt - - IsTrue - - IsFalse - type: string - value: - description: Value is the list of values that - the operand evaluates the input against. 
- Value should be empty if the operator is - Exists, DoesNotExist, IsTrue or IsFalse. - Value should contain exactly one element - if the operator is Gt or Lt and exactly - two elements if the operator is GtLt. In - other cases Value should contain at least - one element. - items: - type: string - type: array - required: - - op - type: object - description: MatchExpressionSet contains a set of - MatchExpressions, each of which is evaluated against - a set of input values. - type: object - required: - - feature - - matchExpressions - type: object - type: array - required: - - matchFeatures - type: object - type: array - matchFeatures: - description: MatchFeatures specifies a set of matcher terms - all of which must match. - items: - description: FeatureMatcherTerm defines requirements against - one feature set. All requirements (specified as MatchExpressions) - are evaluated against each element in the feature set. - properties: - feature: - type: string - matchExpressions: - additionalProperties: - description: "MatchExpression specifies an expression - to evaluate against a set of input values. It contains - an operator that is applied when matching the input - and an array of values that the operator evaluates - the input against. \n NB: CreateMatchExpression or - MustCreateMatchExpression() should be used for creating - new instances. \n NB: Validate() must be called if - Op or Value fields are modified or if a new instance - is created from scratch without using the helper functions." - properties: - op: - description: Op is the operator to be applied. - enum: - - In - - NotIn - - InRegexp - - Exists - - DoesNotExist - - Gt - - Lt - - GtLt - - IsTrue - - IsFalse - type: string - value: - description: Value is the list of values that the - operand evaluates the input against. Value should - be empty if the operator is Exists, DoesNotExist, - IsTrue or IsFalse. Value should contain exactly - one element if the operator is Gt or Lt and exactly - two elements if the operator is GtLt. In other - cases Value should contain at least one element. - items: - type: string - type: array - required: - - op - type: object - description: MatchExpressionSet contains a set of MatchExpressions, - each of which is evaluated against a set of input values. - type: object - required: - - feature - - matchExpressions - type: object - type: array - name: - description: Name of the rule. - type: string - taints: - description: Taints to create if the rule matches. - items: - description: The node this Taint is attached to has the "effect" - on any pod that does not tolerate the Taint. - properties: - effect: - description: Required. The effect of the taint on pods - that do not tolerate the taint. Valid effects are NoSchedule, - PreferNoSchedule and NoExecute. - type: string - key: - description: Required. The taint key to be applied to - a node. - type: string - timeAdded: - description: TimeAdded represents the time at which the - taint was added. It is only written for NoExecute taints. - format: date-time - type: string - value: - description: The taint value corresponding to the taint - key. - type: string - required: - - effect - - key - type: object - type: array - vars: - additionalProperties: - type: string - description: Vars is the variables to store if the rule matches. - Variables do not directly inflict any changes in the node - object. However, they can be referenced from other rules enabling - more complex rule hierarchies, without exposing intermediary - output values as labels. 
- type: object - varsTemplate: - description: VarsTemplate specifies a template to expand for - dynamically generating multiple variables. Data (after template - expansion) must be keys with an optional value ([=]) - separated by newlines. - type: string - required: - - name - type: object - type: array - required: - - rules - type: object - required: - - spec - type: object - served: true - storage: true diff --git a/roles/nfd_install/charts/node-feature-discovery/templates/_helpers.tpl b/roles/nfd_install/charts/node-feature-discovery/templates/_helpers.tpl deleted file mode 100644 index 39c1e3df..00000000 --- a/roles/nfd_install/charts/node-feature-discovery/templates/_helpers.tpl +++ /dev/null @@ -1,96 +0,0 @@ -{{/* vim: set filetype=mustache: */}} -{{/* -Expand the name of the chart. -*/}} -{{- define "node-feature-discovery.name" -}} -{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} -{{- end -}} - -{{/* -Create a default fully qualified app name. -We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). -If release name contains chart name it will be used as a full name. -*/}} -{{- define "node-feature-discovery.fullname" -}} -{{- if .Values.fullnameOverride -}} -{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} -{{- else -}} -{{- $name := default .Chart.Name .Values.nameOverride -}} -{{- if contains $name .Release.Name -}} -{{- .Release.Name | trunc 63 | trimSuffix "-" -}} -{{- else -}} -{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} -{{- end -}} -{{- end -}} -{{- end -}} - -{{/* -Allow the release namespace to be overridden for multi-namespace deployments in combined charts -*/}} -{{- define "node-feature-discovery.namespace" -}} - {{- if .Values.namespaceOverride -}} - {{- .Values.namespaceOverride -}} - {{- else -}} - {{- .Release.Namespace -}} - {{- end -}} -{{- end -}} - -{{/* -Create chart name and version as used by the chart label. -*/}} -{{- define "node-feature-discovery.chart" -}} -{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} -{{- end -}} - -{{/* -Common labels -*/}} -{{- define "node-feature-discovery.labels" -}} -helm.sh/chart: {{ include "node-feature-discovery.chart" . }} -{{ include "node-feature-discovery.selectorLabels" . }} -{{- if .Chart.AppVersion }} -app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} -{{- end }} -app.kubernetes.io/managed-by: {{ .Release.Service }} -{{- end -}} - -{{/* -Selector labels -*/}} -{{- define "node-feature-discovery.selectorLabels" -}} -app.kubernetes.io/name: {{ include "node-feature-discovery.name" . }} -app.kubernetes.io/instance: {{ .Release.Name }} -{{- end -}} - -{{/* -Create the name of the service account which the nfd master will use -*/}} -{{- define "node-feature-discovery.master.serviceAccountName" -}} -{{- if .Values.master.serviceAccount.create -}} - {{ default (include "node-feature-discovery.fullname" .) 
.Values.master.serviceAccount.name }} -{{- else -}} - {{ default "default" .Values.master.serviceAccount.name }} -{{- end -}} -{{- end -}} - -{{/* -Create the name of the service account which the nfd worker will use -*/}} -{{- define "node-feature-discovery.worker.serviceAccountName" -}} -{{- if .Values.worker.serviceAccount.create -}} - {{ default (printf "%s-worker" (include "node-feature-discovery.fullname" .)) .Values.worker.serviceAccount.name }} -{{- else -}} - {{ default "default" .Values.worker.serviceAccount.name }} -{{- end -}} -{{- end -}} - -{{/* -Create the name of the service account which topologyUpdater will use -*/}} -{{- define "node-feature-discovery.topologyUpdater.serviceAccountName" -}} -{{- if .Values.topologyUpdater.serviceAccount.create -}} - {{ default (printf "%s-topology-updater" (include "node-feature-discovery.fullname" .)) .Values.topologyUpdater.serviceAccount.name }} -{{- else -}} - {{ default "default" .Values.topologyUpdater.serviceAccount.name }} -{{- end -}} -{{- end -}} diff --git a/roles/nfd_install/charts/node-feature-discovery/templates/cert-manager-certs.yaml b/roles/nfd_install/charts/node-feature-discovery/templates/cert-manager-certs.yaml deleted file mode 100644 index ac2e51fc..00000000 --- a/roles/nfd_install/charts/node-feature-discovery/templates/cert-manager-certs.yaml +++ /dev/null @@ -1,67 +0,0 @@ -{{- if .Values.tls.certManager }} ---- -apiVersion: cert-manager.io/v1 -kind: Certificate -metadata: - name: nfd-master-cert - namespace: {{ include "node-feature-discovery.namespace" . }} -spec: - secretName: nfd-master-cert - subject: - organizations: - - node-feature-discovery - commonName: nfd-master - dnsNames: - # must match the service name - - {{ include "node-feature-discovery.fullname" . }}-master - # first one is configured for use by the worker; below are for completeness - - {{ include "node-feature-discovery.fullname" . }}-master.{{ include "node-feature-discovery.namespace" . }}.svc - - {{ include "node-feature-discovery.fullname" . }}-master.{{ include "node-feature-discovery.namespace" . }}.svc.cluster.local - # localhost needed for grpc_health_probe - - localhost - issuerRef: - name: nfd-ca-issuer - kind: Issuer - group: cert-manager.io - ---- -apiVersion: cert-manager.io/v1 -kind: Certificate -metadata: - name: nfd-worker-cert - namespace: {{ include "node-feature-discovery.namespace" . }} -spec: - secretName: nfd-worker-cert - subject: - organizations: - - node-feature-discovery - commonName: nfd-worker - dnsNames: - - {{ include "node-feature-discovery.fullname" . }}-worker.{{ include "node-feature-discovery.namespace" . }}.svc.cluster.local - issuerRef: - name: nfd-ca-issuer - kind: Issuer - group: cert-manager.io - -{{- if .Values.topologyUpdater.enable }} ---- -apiVersion: cert-manager.io/v1 -kind: Certificate -metadata: - name: nfd-topology-updater-cert - namespace: {{ include "node-feature-discovery.namespace" . }} -spec: - secretName: nfd-topology-updater-cert - subject: - organizations: - - node-feature-discovery - commonName: nfd-topology-updater - dnsNames: - - {{ include "node-feature-discovery.fullname" . }}-topology-updater.{{ include "node-feature-discovery.namespace" . 
}}.svc.cluster.local - issuerRef: - name: nfd-ca-issuer - kind: Issuer - group: cert-manager.io -{{- end }} - -{{- end }} diff --git a/roles/nfd_install/charts/node-feature-discovery/templates/cert-manager-issuer.yaml b/roles/nfd_install/charts/node-feature-discovery/templates/cert-manager-issuer.yaml deleted file mode 100644 index f3c57ace..00000000 --- a/roles/nfd_install/charts/node-feature-discovery/templates/cert-manager-issuer.yaml +++ /dev/null @@ -1,42 +0,0 @@ -{{- if .Values.tls.certManager }} -# See https://cert-manager.io/docs/configuration/selfsigned/#bootstrapping-ca-issuers -# - Create a self signed issuer -# - Use this to create a CA cert -# - Use this to now create a CA issuer ---- -apiVersion: cert-manager.io/v1 -kind: Issuer -metadata: - name: nfd-ca-bootstrap - namespace: {{ include "node-feature-discovery.namespace" . }} -spec: - selfSigned: {} - ---- -apiVersion: cert-manager.io/v1 -kind: Certificate -metadata: - name: nfd-ca-cert - namespace: {{ include "node-feature-discovery.namespace" . }} -spec: - isCA: true - secretName: nfd-ca-cert - subject: - organizations: - - node-feature-discovery - commonName: nfd-ca-cert - issuerRef: - name: nfd-ca-bootstrap - kind: Issuer - group: cert-manager.io - ---- -apiVersion: cert-manager.io/v1 -kind: Issuer -metadata: - name: nfd-ca-issuer - namespace: {{ include "node-feature-discovery.namespace" . }} -spec: - ca: - secretName: nfd-ca-cert -{{- end }} diff --git a/roles/nfd_install/charts/node-feature-discovery/templates/clusterrole.yaml b/roles/nfd_install/charts/node-feature-discovery/templates/clusterrole.yaml deleted file mode 100644 index 9e75927e..00000000 --- a/roles/nfd_install/charts/node-feature-discovery/templates/clusterrole.yaml +++ /dev/null @@ -1,97 +0,0 @@ -{{- if .Values.master.rbac.create }} -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: {{ include "node-feature-discovery.fullname" . }} - labels: - {{- include "node-feature-discovery.labels" . | nindent 4 }} -rules: -- apiGroups: - - "" - resources: - - nodes - - nodes/status - verbs: - - get - - patch - - update - - list -- apiGroups: - - nfd.k8s-sigs.io - resources: - - nodefeatures - - nodefeaturerules - verbs: - - get - - list - - watch -{{- end }} - ---- -{{- if and .Values.topologyUpdater.enable .Values.topologyUpdater.rbac.create }} -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: {{ include "node-feature-discovery.fullname" . }}-topology-updater - labels: - {{- include "node-feature-discovery.labels" . | nindent 4 }} -rules: -- apiGroups: - - "" - resources: - - nodes - verbs: - - get - - list -- apiGroups: - - "" - resources: - - nodes/proxy - verbs: - - get -- apiGroups: - - "" - resources: - - pods - verbs: - - get -- apiGroups: - - topology.node.k8s.io - resources: - - noderesourcetopologies - verbs: - - create - - get - - update -{{- end }} - ---- -{{- if and .Values.topologyGC.enable .Values.topologyGC.rbac.create .Values.topologyUpdater.enable }} -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: {{ include "node-feature-discovery.fullname" . }}-topology-gc - labels: - {{- include "node-feature-discovery.labels" . 
| nindent 4 }} -rules: -- apiGroups: - - "" - resources: - - nodes - verbs: - - list - - watch -- apiGroups: - - "" - resources: - - nodes/proxy - verbs: - - get -- apiGroups: - - topology.node.k8s.io - resources: - - noderesourcetopologies - verbs: - - delete - - list -{{- end }} diff --git a/roles/nfd_install/charts/node-feature-discovery/templates/clusterrolebinding.yaml b/roles/nfd_install/charts/node-feature-discovery/templates/clusterrolebinding.yaml deleted file mode 100644 index 227bce5e..00000000 --- a/roles/nfd_install/charts/node-feature-discovery/templates/clusterrolebinding.yaml +++ /dev/null @@ -1,52 +0,0 @@ -{{- if .Values.master.rbac.create }} -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: {{ include "node-feature-discovery.fullname" . }} - labels: - {{- include "node-feature-discovery.labels" . | nindent 4 }} -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: {{ include "node-feature-discovery.fullname" . }} -subjects: -- kind: ServiceAccount - name: {{ include "node-feature-discovery.master.serviceAccountName" . }} - namespace: {{ include "node-feature-discovery.namespace" . }} -{{- end }} - ---- -{{- if .Values.topologyUpdater.rbac.create }} -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: {{ include "node-feature-discovery.fullname" . }}-topology-updater - labels: - {{- include "node-feature-discovery.labels" . | nindent 4 }} -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: {{ include "node-feature-discovery.fullname" . }}-topology-updater -subjects: -- kind: ServiceAccount - name: {{ include "node-feature-discovery.topologyUpdater.serviceAccountName" . }} - namespace: {{ include "node-feature-discovery.namespace" . }} -{{- end }} - ---- -{{- if and .Values.topologyGC.enable .Values.topologyGC.rbac.create .Values.topologyUpdater.enable }} -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: {{ include "node-feature-discovery.fullname" . }}-topology-gc - labels: - {{- include "node-feature-discovery.labels" . | nindent 4 }} -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: {{ include "node-feature-discovery.fullname" . }}-topology-gc -subjects: -- kind: ServiceAccount - name: {{ .Values.topologyGC.serviceAccount.name | default "nfd-topology-gc" }} - namespace: {{ include "node-feature-discovery.namespace" . }} -{{- end }} diff --git a/roles/nfd_install/charts/node-feature-discovery/templates/master.yaml b/roles/nfd_install/charts/node-feature-discovery/templates/master.yaml deleted file mode 100644 index 264e3bb7..00000000 --- a/roles/nfd_install/charts/node-feature-discovery/templates/master.yaml +++ /dev/null @@ -1,148 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{ include "node-feature-discovery.fullname" . }}-master - namespace: {{ include "node-feature-discovery.namespace" . }} - labels: - {{- include "node-feature-discovery.labels" . | nindent 4 }} - role: master - annotations: - {{- toYaml .Values.master.deploymentAnnotations | nindent 4 }} -spec: - replicas: {{ .Values.master.replicaCount }} - selector: - matchLabels: - {{- include "node-feature-discovery.selectorLabels" . | nindent 6 }} - role: master - template: - metadata: - labels: - {{- include "node-feature-discovery.selectorLabels" . 
| nindent 8 }} - role: master - annotations: - {{- toYaml .Values.master.annotations | nindent 8 }} - spec: - {{- with .Values.imagePullSecrets }} - imagePullSecrets: - {{- toYaml . | nindent 8 }} - {{- end }} - serviceAccountName: {{ include "node-feature-discovery.master.serviceAccountName" . }} - enableServiceLinks: false - securityContext: - {{- toYaml .Values.master.podSecurityContext | nindent 8 }} - containers: - - name: master - securityContext: - {{- toYaml .Values.master.securityContext | nindent 12 }} - image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" - imagePullPolicy: {{ .Values.image.pullPolicy }} - livenessProbe: - exec: - command: - - "/usr/bin/grpc_health_probe" - - "-addr=:{{ .Values.master.port | default "8080" }}" - {{- if .Values.tls.enable }} - - "-tls" - - "-tls-ca-cert=/etc/kubernetes/node-feature-discovery/certs/ca.crt" - - "-tls-client-key=/etc/kubernetes/node-feature-discovery/certs/tls.key" - - "-tls-client-cert=/etc/kubernetes/node-feature-discovery/certs/tls.crt" - {{- end }} - initialDelaySeconds: 10 - periodSeconds: 10 - readinessProbe: - exec: - command: - - "/usr/bin/grpc_health_probe" - - "-addr=:{{ .Values.master.port | default "8080" }}" - {{- if .Values.tls.enable }} - - "-tls" - - "-tls-ca-cert=/etc/kubernetes/node-feature-discovery/certs/ca.crt" - - "-tls-client-key=/etc/kubernetes/node-feature-discovery/certs/tls.key" - - "-tls-client-cert=/etc/kubernetes/node-feature-discovery/certs/tls.crt" - {{- end }} - initialDelaySeconds: 5 - periodSeconds: 10 - failureThreshold: 10 - ports: - - containerPort: {{ .Values.master.port | default "8080" }} - name: grpc - env: - - name: NODE_NAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - command: - - "nfd-master" - resources: - {{- toYaml .Values.master.resources | nindent 12 }} - args: - {{- if .Values.master.instance | empty | not }} - - "-instance={{ .Values.master.instance }}" - {{- end }} - - "-port={{ .Values.master.port | default "8080" }}" - {{- if .Values.enableNodeFeatureApi }} - - "-enable-nodefeature-api" - {{- end }} - {{- if .Values.master.extraLabelNs | empty | not }} - - "-extra-label-ns={{- join "," .Values.master.extraLabelNs }}" - {{- end }} - {{- if .Values.master.denyLabelNs | empty | not }} - - "-deny-label-ns={{- join "," .Values.master.denyLabelNs }}" - {{- end }} - {{- if .Values.master.resourceLabels | empty | not }} - - "-resource-labels={{- join "," .Values.master.resourceLabels }}" - {{- end }} - {{- if .Values.master.enableTaints }} - - "-enable-taints" - {{- end }} - {{- if .Values.master.crdController | kindIs "invalid" | not }} - - "-crd-controller={{ .Values.master.crdController }}" - {{- else }} - ## By default, disable crd controller for other than the default instances - - "-featurerules-controller={{ .Values.master.instance | empty }}" - {{- end }} - {{- if .Values.master.featureRulesController | kindIs "invalid" | not }} - - "-featurerules-controller={{ .Values.master.featureRulesController }}" - {{- end }} - {{- if .Values.master.resyncPeriod }} - - "-resync-period={{ .Values.master.resyncPeriod }}" - {{- end }} - {{- if .Values.tls.enable }} - - "-ca-file=/etc/kubernetes/node-feature-discovery/certs/ca.crt" - - "-key-file=/etc/kubernetes/node-feature-discovery/certs/tls.key" - - "-cert-file=/etc/kubernetes/node-feature-discovery/certs/tls.crt" - {{- end }} - volumeMounts: - {{- if .Values.tls.enable }} - - name: nfd-master-cert - mountPath: "/etc/kubernetes/node-feature-discovery/certs" - readOnly: true - {{- end }} - - 
name: nfd-master-conf - mountPath: "/etc/kubernetes/node-feature-discovery" - readOnly: true - volumes: - {{- if .Values.tls.enable }} - - name: nfd-master-cert - secret: - secretName: nfd-master-cert - {{- end }} - - name: nfd-master-conf - configMap: - name: {{ include "node-feature-discovery.fullname" . }}-master-conf - items: - - key: nfd-master.conf - path: nfd-master.conf - - {{- with .Values.master.nodeSelector }} - nodeSelector: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.master.affinity }} - affinity: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.master.tolerations }} - tolerations: - {{- toYaml . | nindent 8 }} - {{- end }} diff --git a/roles/nfd_install/charts/node-feature-discovery/templates/nfd-master-conf.yaml b/roles/nfd_install/charts/node-feature-discovery/templates/nfd-master-conf.yaml deleted file mode 100644 index c806a8e5..00000000 --- a/roles/nfd_install/charts/node-feature-discovery/templates/nfd-master-conf.yaml +++ /dev/null @@ -1,10 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: {{ include "node-feature-discovery.fullname" . }}-master-conf - namespace: {{ include "node-feature-discovery.namespace" . }} - labels: - {{- include "node-feature-discovery.labels" . | nindent 4 }} -data: - nfd-master.conf: |- - {{- .Values.master.config | toYaml | nindent 4 }} diff --git a/roles/nfd_install/charts/node-feature-discovery/templates/nfd-topologyupdater-conf.yaml b/roles/nfd_install/charts/node-feature-discovery/templates/nfd-topologyupdater-conf.yaml deleted file mode 100644 index 9867f508..00000000 --- a/roles/nfd_install/charts/node-feature-discovery/templates/nfd-topologyupdater-conf.yaml +++ /dev/null @@ -1,10 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: {{ include "node-feature-discovery.fullname" . }}-topology-updater-conf - namespace: {{ include "node-feature-discovery.namespace" . }} - labels: - {{- include "node-feature-discovery.labels" . | nindent 4 }} -data: - nfd-topology-updater.conf: |- - {{- .Values.topologyUpdater.config | toYaml | nindent 4 }} diff --git a/roles/nfd_install/charts/node-feature-discovery/templates/nfd-worker-conf.yaml b/roles/nfd_install/charts/node-feature-discovery/templates/nfd-worker-conf.yaml deleted file mode 100644 index 61d2a481..00000000 --- a/roles/nfd_install/charts/node-feature-discovery/templates/nfd-worker-conf.yaml +++ /dev/null @@ -1,10 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: {{ include "node-feature-discovery.fullname" . }}-worker-conf - namespace: {{ include "node-feature-discovery.namespace" . }} - labels: - {{- include "node-feature-discovery.labels" . | nindent 4 }} -data: - nfd-worker.conf: |- - {{- .Values.worker.config | toYaml | nindent 4 }} diff --git a/roles/nfd_install/charts/node-feature-discovery/templates/role.yaml b/roles/nfd_install/charts/node-feature-discovery/templates/role.yaml deleted file mode 100644 index f63cb8ff..00000000 --- a/roles/nfd_install/charts/node-feature-discovery/templates/role.yaml +++ /dev/null @@ -1,18 +0,0 @@ -{{- if .Values.worker.rbac.create }} -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: {{ include "node-feature-discovery.fullname" . }}-worker - labels: - {{- include "node-feature-discovery.labels" . 
| nindent 4 }} -rules: -- apiGroups: - - nfd.k8s-sigs.io - resources: - - nodefeatures - verbs: - - create - - get - - update -{{- end }} - diff --git a/roles/nfd_install/charts/node-feature-discovery/templates/rolebinding.yaml b/roles/nfd_install/charts/node-feature-discovery/templates/rolebinding.yaml deleted file mode 100644 index 30a00381..00000000 --- a/roles/nfd_install/charts/node-feature-discovery/templates/rolebinding.yaml +++ /dev/null @@ -1,17 +0,0 @@ -{{- if .Values.worker.rbac.create }} -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: {{ include "node-feature-discovery.fullname" . }}-worker - labels: - {{- include "node-feature-discovery.labels" . | nindent 4 }} -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: {{ include "node-feature-discovery.fullname" . }}-worker -subjects: -- kind: ServiceAccount - name: {{ include "node-feature-discovery.worker.serviceAccountName" . }} - namespace: {{ include "node-feature-discovery.namespace" . }} -{{- end }} - diff --git a/roles/nfd_install/charts/node-feature-discovery/templates/service.yaml b/roles/nfd_install/charts/node-feature-discovery/templates/service.yaml deleted file mode 100644 index 0d478981..00000000 --- a/roles/nfd_install/charts/node-feature-discovery/templates/service.yaml +++ /dev/null @@ -1,18 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: {{ include "node-feature-discovery.fullname" . }}-master - namespace: {{ include "node-feature-discovery.namespace" . }} - labels: - {{- include "node-feature-discovery.labels" . | nindent 4 }} - role: master -spec: - type: {{ .Values.master.service.type }} - ports: - - port: {{ .Values.master.service.port | default "8080" }} - targetPort: grpc - protocol: TCP - name: grpc - selector: - {{- include "node-feature-discovery.selectorLabels" . | nindent 4 }} - role: master diff --git a/roles/nfd_install/charts/node-feature-discovery/templates/serviceaccount.yaml b/roles/nfd_install/charts/node-feature-discovery/templates/serviceaccount.yaml deleted file mode 100644 index 022961e4..00000000 --- a/roles/nfd_install/charts/node-feature-discovery/templates/serviceaccount.yaml +++ /dev/null @@ -1,58 +0,0 @@ -{{- if .Values.master.serviceAccount.create -}} -apiVersion: v1 -kind: ServiceAccount -metadata: - name: {{ include "node-feature-discovery.master.serviceAccountName" . }} - namespace: {{ include "node-feature-discovery.namespace" . }} - labels: - {{- include "node-feature-discovery.labels" . | nindent 4 }} - {{- with .Values.master.serviceAccount.annotations }} - annotations: - {{- toYaml . | nindent 4 }} - {{- end }} -{{- end }} - ---- -{{- if .Values.topologyUpdater.serviceAccount.create }} -apiVersion: v1 -kind: ServiceAccount -metadata: - name: {{ include "node-feature-discovery.topologyUpdater.serviceAccountName" . }} - namespace: {{ include "node-feature-discovery.namespace" . }} - labels: - {{- include "node-feature-discovery.labels" . | nindent 4 }} - {{- with .Values.topologyUpdater.serviceAccount.annotations }} - annotations: - {{- toYaml . | nindent 4 }} - {{- end }} -{{- end }} - ---- -{{- if and .Values.topologyGC.enable .Values.topologyGC.serviceAccount.create .Values.topologyUpdater.enable }} -apiVersion: v1 -kind: ServiceAccount -metadata: - name: {{ .Values.topologyGC.serviceAccount.name | default "nfd-topology-gc" }} - namespace: {{ include "node-feature-discovery.namespace" . }} - labels: - {{- include "node-feature-discovery.labels" . 
| nindent 4 }} - {{- with .Values.topologyUpdater.serviceAccount.annotations }} - annotations: - {{- toYaml . | nindent 4 }} - {{- end }} -{{- end }} - ---- -{{- if .Values.worker.serviceAccount.create }} -apiVersion: v1 -kind: ServiceAccount -metadata: - name: {{ include "node-feature-discovery.worker.serviceAccountName" . }} - namespace: {{ include "node-feature-discovery.namespace" . }} - labels: - {{- include "node-feature-discovery.labels" . | nindent 4 }} - {{- with .Values.worker.serviceAccount.annotations }} - annotations: - {{- toYaml . | nindent 4 }} - {{- end }} -{{- end }} diff --git a/roles/nfd_install/charts/node-feature-discovery/templates/topology-gc.yaml b/roles/nfd_install/charts/node-feature-discovery/templates/topology-gc.yaml deleted file mode 100644 index 642fec45..00000000 --- a/roles/nfd_install/charts/node-feature-discovery/templates/topology-gc.yaml +++ /dev/null @@ -1,64 +0,0 @@ -{{- if and .Values.topologyGC.enable .Values.topologyUpdater.enable -}} -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{ include "node-feature-discovery.fullname" . }}-topology-gc - namespace: {{ include "node-feature-discovery.namespace" . }} - labels: - {{- include "node-feature-discovery.labels" . | nindent 4 }} - role: topology-gc -spec: - replicas: {{ .Values.topologyGC.replicaCount | default 1 }} - selector: - matchLabels: - {{- include "node-feature-discovery.selectorLabels" . | nindent 6 }} - role: topology-gc - template: - metadata: - labels: - {{- include "node-feature-discovery.selectorLabels" . | nindent 8 }} - role: topology-gc - annotations: - {{- toYaml .Values.topologyGC.annotations | nindent 8 }} - spec: - serviceAccountName: {{ .Values.topologyGC.serviceAccountName | default "nfd-topology-gc" }} - dnsPolicy: ClusterFirstWithHostNet - {{- with .Values.imagePullSecrets }} - imagePullSecrets: - {{- toYaml . | nindent 8 }} - {{- end }} - securityContext: - {{- toYaml .Values.topologyGC.podSecurityContext | nindent 8 }} - containers: - - name: topology-gc - image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" - imagePullPolicy: "{{ .Values.image.pullPolicy }}" - env: - - name: NODE_NAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - command: - - "nfd-topology-gc" - args: - {{- if .Values.topologyGC.interval | empty | not }} - - "-gc-interval={{ .Values.topologyGC.interval }}" - {{- end }} - resources: - {{- toYaml .Values.topologyGC.resources | nindent 12 }} - securityContext: - {{- toYaml .Values.topologyGC.securityContext | nindent 12 }} - - {{- with .Values.topologyGC.nodeSelector }} - nodeSelector: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.topologyGC.affinity }} - affinity: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.topologyGC.tolerations }} - tolerations: - {{- toYaml . 
| nindent 8 }} - {{- end }} -{{- end }} diff --git a/roles/nfd_install/charts/node-feature-discovery/templates/topologyupdater-crds.yaml b/roles/nfd_install/charts/node-feature-discovery/templates/topologyupdater-crds.yaml deleted file mode 100644 index f5a1c6e3..00000000 --- a/roles/nfd_install/charts/node-feature-discovery/templates/topologyupdater-crds.yaml +++ /dev/null @@ -1,278 +0,0 @@ -{{- if and .Values.topologyUpdater.enable .Values.topologyUpdater.createCRDs -}} -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - annotations: - api-approved.kubernetes.io: https://github.com/kubernetes/enhancements/pull/1870 - controller-gen.kubebuilder.io/version: v0.11.2 - creationTimestamp: null - name: noderesourcetopologies.topology.node.k8s.io -spec: - group: topology.node.k8s.io - names: - kind: NodeResourceTopology - listKind: NodeResourceTopologyList - plural: noderesourcetopologies - shortNames: - - node-res-topo - singular: noderesourcetopology - scope: Cluster - versions: - - name: v1alpha1 - schema: - openAPIV3Schema: - description: NodeResourceTopology describes node resources and their topology. - properties: - apiVersion: - description: 'APIVersion defines the versioned schema of this representation - of an object. Servers should convert recognized schemas to the latest - internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' - type: string - kind: - description: 'Kind is a string value representing the REST resource this - object represents. Servers may infer this from the endpoint the client - submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' - type: string - metadata: - type: object - topologyPolicies: - items: - type: string - type: array - zones: - description: ZoneList contains an array of Zone objects. - items: - description: Zone represents a resource topology zone, e.g. socket, - node, die or core. - properties: - attributes: - description: AttributeList contains an array of AttributeInfo objects. - items: - description: AttributeInfo contains one attribute of a Zone. - properties: - name: - type: string - value: - type: string - required: - - name - - value - type: object - type: array - costs: - description: CostList contains an array of CostInfo objects. - items: - description: CostInfo describes the cost (or distance) between - two Zones. - properties: - name: - type: string - value: - format: int64 - type: integer - required: - - name - - value - type: object - type: array - name: - type: string - parent: - type: string - resources: - description: ResourceInfoList contains an array of ResourceInfo - objects. - items: - description: ResourceInfo contains information about one resource - type. - properties: - allocatable: - anyOf: - - type: integer - - type: string - description: Allocatable quantity of the resource, corresponding - to allocatable in node status, i.e. total amount of this - resource available to be used by pods. - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - available: - anyOf: - - type: integer - - type: string - description: Available is the amount of this resource currently - available for new (to be scheduled) pods, i.e. Allocatable - minus the resources reserved by currently running pods. 
- pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - capacity: - anyOf: - - type: integer - - type: string - description: Capacity of the resource, corresponding to capacity - in node status, i.e. total amount of this resource that - the node has. - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - name: - description: Name of the resource. - type: string - required: - - allocatable - - available - - capacity - - name - type: object - type: array - type: - type: string - required: - - name - - type - type: object - type: array - required: - - topologyPolicies - - zones - type: object - served: true -storage: false - - name: v1alpha2 - schema: - openAPIV3Schema: - description: NodeResourceTopology describes node resources and their topology. - properties: - apiVersion: - description: 'APIVersion defines the versioned schema of this representation - of an object. Servers should convert recognized schemas to the latest - internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' - type: string - attributes: - description: AttributeList contains an array of AttributeInfo objects. - items: - description: AttributeInfo contains one attribute of a Zone. - properties: - name: - type: string - value: - type: string - required: - - name - - value - type: object - type: array - kind: - description: 'Kind is a string value representing the REST resource this - object represents. Servers may infer this from the endpoint the client - submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' - type: string - metadata: - type: object - topologyPolicies: - description: 'DEPRECATED (to be removed in v1beta1): use top level attributes - if needed' - items: - type: string - type: array - zones: - description: ZoneList contains an array of Zone objects. - items: - description: Zone represents a resource topology zone, e.g. socket, - node, die or core. - properties: - attributes: - description: AttributeList contains an array of AttributeInfo objects. - items: - description: AttributeInfo contains one attribute of a Zone. - properties: - name: - type: string - value: - type: string - required: - - name - - value - type: object - type: array - costs: - description: CostList contains an array of CostInfo objects. - items: - description: CostInfo describes the cost (or distance) between - two Zones. - properties: - name: - type: string - value: - format: int64 - type: integer - required: - - name - - value - type: object - type: array - name: - type: string - parent: - type: string - resources: - description: ResourceInfoList contains an array of ResourceInfo - objects. - items: - description: ResourceInfo contains information about one resource - type. - properties: - allocatable: - anyOf: - - type: integer - - type: string - description: Allocatable quantity of the resource, corresponding - to allocatable in node status, i.e. total amount of this - resource available to be used by pods. 
- pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - available: - anyOf: - - type: integer - - type: string - description: Available is the amount of this resource currently - available for new (to be scheduled) pods, i.e. Allocatable - minus the resources reserved by currently running pods. - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - capacity: - anyOf: - - type: integer - - type: string - description: Capacity of the resource, corresponding to capacity - in node status, i.e. total amount of this resource that - the node has. - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - name: - description: Name of the resource. - type: string - required: - - allocatable - - available - - capacity - - name - type: object - type: array - type: - type: string - required: - - name - - type - type: object - type: array - required: - - zones - type: object - served: true - storage: true -status: - acceptedNames: - kind: "" - plural: "" - conditions: [] - storedVersions: [] -{{- end }} diff --git a/roles/nfd_install/charts/node-feature-discovery/templates/topologyupdater.yaml b/roles/nfd_install/charts/node-feature-discovery/templates/topologyupdater.yaml deleted file mode 100644 index 4963a52b..00000000 --- a/roles/nfd_install/charts/node-feature-discovery/templates/topologyupdater.yaml +++ /dev/null @@ -1,137 +0,0 @@ -{{- if .Values.topologyUpdater.enable -}} -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: {{ include "node-feature-discovery.fullname" . }}-topology-updater - namespace: {{ include "node-feature-discovery.namespace" . }} - labels: - {{- include "node-feature-discovery.labels" . | nindent 4 }} - role: topology-updater -spec: - selector: - matchLabels: - {{- include "node-feature-discovery.selectorLabels" . | nindent 6 }} - role: topology-updater - template: - metadata: - labels: - {{- include "node-feature-discovery.selectorLabels" . | nindent 8 }} - role: topology-updater - annotations: - {{- toYaml .Values.topologyUpdater.annotations | nindent 8 }} - spec: - serviceAccountName: {{ include "node-feature-discovery.topologyUpdater.serviceAccountName" . }} - dnsPolicy: ClusterFirstWithHostNet - {{- with .Values.imagePullSecrets }} - imagePullSecrets: - {{- toYaml . 
| nindent 8 }} - {{- end }} - securityContext: - {{- toYaml .Values.topologyUpdater.podSecurityContext | nindent 8 }} - containers: - - name: topology-updater - image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" - imagePullPolicy: "{{ .Values.image.pullPolicy }}" - env: - - name: NODE_NAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - command: - - "nfd-topology-updater" - args: - {{- if .Values.topologyUpdater.updateInterval | empty | not }} - - "-sleep-interval={{ .Values.topologyUpdater.updateInterval }}" - {{- else }} - - "-sleep-interval=3s" - {{- end }} - {{- if .Values.topologyUpdater.watchNamespace | empty | not }} - - "-watch-namespace={{ .Values.topologyUpdater.watchNamespace }}" - {{- else }} - - "-watch-namespace=*" - {{- end }} - {{- if .Values.tls.enable }} - - "-ca-file=/etc/kubernetes/node-feature-discovery/certs/ca.crt" - - "-key-file=/etc/kubernetes/node-feature-discovery/certs/tls.key" - - "-cert-file=/etc/kubernetes/node-feature-discovery/certs/tls.crt" - {{- end }} - {{- if .Values.topologyUpdater.podSetFingerprint }} - - "-pods-fingerprint" - {{- end }} - {{- if .Values.topologyUpdater.kubeletConfigPath | empty | not }} - - "-kubelet-config-uri=file:///host-var/kubelet-config" - {{- end }} - volumeMounts: - {{- if .Values.topologyUpdater.kubeletConfigPath | empty | not }} - - name: kubelet-config - mountPath: /host-var/kubelet-config - {{- end }} - - name: kubelet-podresources-sock - mountPath: /host-var/lib/kubelet/pod-resources/kubelet.sock - - name: host-sys - mountPath: /host-sys - {{- if .Values.topologyUpdater.kubeletStateDir | empty | not }} - - name: kubelet-state-files - mountPath: /host-var/lib/kubelet - readOnly: true - {{- end }} - {{- if .Values.tls.enable }} - - name: nfd-topology-updater-cert - mountPath: "/etc/kubernetes/node-feature-discovery/certs" - readOnly: true - {{- end }} - - name: nfd-topology-updater-conf - mountPath: "/etc/kubernetes/node-feature-discovery" - readOnly: true - - resources: - {{- toYaml .Values.topologyUpdater.resources | nindent 12 }} - securityContext: - {{- toYaml .Values.topologyUpdater.securityContext | nindent 12 }} - volumes: - - name: host-sys - hostPath: - path: "/sys" - {{- if .Values.topologyUpdater.kubeletConfigPath | empty | not }} - - name: kubelet-config - hostPath: - path: {{ .Values.topologyUpdater.kubeletConfigPath }} - {{- end }} - - name: kubelet-podresources-sock - hostPath: - {{- if .Values.topologyUpdater.kubeletPodResourcesSockPath | empty | not }} - path: {{ .Values.topologyUpdater.kubeletPodResourcesSockPath }} - {{- else }} - path: /var/lib/kubelet/pod-resources/kubelet.sock - {{- end }} - {{- if .Values.topologyUpdater.kubeletStateDir | empty | not }} - - name: kubelet-state-files - hostPath: - path: {{ .Values.topologyUpdater.kubeletStateDir }} - {{- end }} - - name: nfd-topology-updater-conf - configMap: - name: {{ include "node-feature-discovery.fullname" . }}-topology-updater-conf - items: - - key: nfd-topology-updater.conf - path: nfd-topology-updater.conf - {{- if .Values.tls.enable }} - - name: nfd-topology-updater-cert - secret: - secretName: nfd-topology-updater-cert - {{- end }} - - - {{- with .Values.topologyUpdater.nodeSelector }} - nodeSelector: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.topologyUpdater.affinity }} - affinity: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.topologyUpdater.tolerations }} - tolerations: - {{- toYaml . 
| nindent 8 }} - {{- end }} -{{- end }} diff --git a/roles/nfd_install/charts/node-feature-discovery/templates/worker.yaml b/roles/nfd_install/charts/node-feature-discovery/templates/worker.yaml deleted file mode 100644 index c1240bdc..00000000 --- a/roles/nfd_install/charts/node-feature-discovery/templates/worker.yaml +++ /dev/null @@ -1,144 +0,0 @@ -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: {{ include "node-feature-discovery.fullname" . }}-worker - namespace: {{ include "node-feature-discovery.namespace" . }} - labels: - {{- include "node-feature-discovery.labels" . | nindent 4 }} - role: worker - annotations: - {{- toYaml .Values.worker.daemonsetAnnotations | nindent 4 }} -spec: - selector: - matchLabels: - {{- include "node-feature-discovery.selectorLabels" . | nindent 6 }} - role: worker - template: - metadata: - labels: - {{- include "node-feature-discovery.selectorLabels" . | nindent 8 }} - role: worker - annotations: - {{- toYaml .Values.worker.annotations | nindent 8 }} - spec: - dnsPolicy: ClusterFirstWithHostNet - {{- with .Values.imagePullSecrets }} - imagePullSecrets: - {{- toYaml . | nindent 8 }} - {{- end }} - serviceAccountName: {{ include "node-feature-discovery.worker.serviceAccountName" . }} - securityContext: - {{- toYaml .Values.worker.podSecurityContext | nindent 8 }} - containers: - - name: worker - securityContext: - {{- toYaml .Values.worker.securityContext | nindent 12 }} - image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" - imagePullPolicy: {{ .Values.image.pullPolicy }} - env: - - name: NODE_NAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - resources: - {{- toYaml .Values.worker.resources | nindent 12 }} - command: - - "nfd-worker" - args: - - "-server={{ include "node-feature-discovery.fullname" . 
}}-master:{{ .Values.master.service.port }}" - {{- if .Values.enableNodeFeatureApi }} - - "-enable-nodefeature-api" - {{- end }} -{{- if .Values.tls.enable }} - - "-ca-file=/etc/kubernetes/node-feature-discovery/certs/ca.crt" - - "-key-file=/etc/kubernetes/node-feature-discovery/certs/tls.key" - - "-cert-file=/etc/kubernetes/node-feature-discovery/certs/tls.crt" -{{- end }} - volumeMounts: - - name: host-boot - mountPath: "/host-boot" - readOnly: true - - name: host-os-release - mountPath: "/host-etc/os-release" - readOnly: true - - name: host-sys - mountPath: "/host-sys" - readOnly: true - - name: host-usr-lib - mountPath: "/host-usr/lib" - readOnly: true - - name: host-lib - mountPath: "/host-lib" - readOnly: true - {{- if .Values.worker.mountUsrSrc }} - - name: host-usr-src - mountPath: "/host-usr/src" - readOnly: true - {{- end }} - - name: source-d - mountPath: "/etc/kubernetes/node-feature-discovery/source.d/" - readOnly: true - - name: features-d - mountPath: "/etc/kubernetes/node-feature-discovery/features.d/" - readOnly: true - - name: nfd-worker-conf - mountPath: "/etc/kubernetes/node-feature-discovery" - readOnly: true -{{- if .Values.tls.enable }} - - name: nfd-worker-cert - mountPath: "/etc/kubernetes/node-feature-discovery/certs" - readOnly: true -{{- end }} - volumes: - - name: host-boot - hostPath: - path: "/boot" - - name: host-os-release - hostPath: - path: "/etc/os-release" - - name: host-sys - hostPath: - path: "/sys" - - name: host-usr-lib - hostPath: - path: "/usr/lib" - - name: host-lib - hostPath: - path: "/lib" - {{- if .Values.worker.mountUsrSrc }} - - name: host-usr-src - hostPath: - path: "/usr/src" - {{- end }} - - name: source-d - hostPath: - path: "/etc/kubernetes/node-feature-discovery/source.d/" - - name: features-d - hostPath: - path: "/etc/kubernetes/node-feature-discovery/features.d/" - - name: nfd-worker-conf - configMap: - name: {{ include "node-feature-discovery.fullname" . }}-worker-conf - items: - - key: nfd-worker.conf - path: nfd-worker.conf -{{- if .Values.tls.enable }} - - name: nfd-worker-cert - secret: - secretName: nfd-worker-cert -{{- end }} - {{- with .Values.worker.nodeSelector }} - nodeSelector: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.worker.affinity }} - affinity: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.worker.tolerations }} - tolerations: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.worker.priorityClassName }} - priorityClassName: {{ . 
| quote }} - {{- end }} diff --git a/roles/nfd_install/charts/node-feature-discovery/values.yaml b/roles/nfd_install/charts/node-feature-discovery/values.yaml index ee9b25e1..d94df8b0 100644 --- a/roles/nfd_install/charts/node-feature-discovery/values.yaml +++ b/roles/nfd_install/charts/node-feature-discovery/values.yaml @@ -93,10 +93,6 @@ master: nodeSelector: {} tolerations: - - key: "node-role.kubernetes.io/master" - operator: "Equal" - value: "" - effect: "NoSchedule" - key: "node-role.kubernetes.io/control-plane" operator: "Equal" value: "" @@ -107,12 +103,6 @@ master: affinity: nodeAffinity: preferredDuringSchedulingIgnoredDuringExecution: - - weight: 1 - preference: - matchExpressions: - - key: "node-role.kubernetes.io/master" - operator: In - values: [""] - weight: 1 preference: matchExpressions: diff --git a/roles/nfd_install/defaults/main.yml b/roles/nfd_install/defaults/main.yml index ff0e4f59..22dca5ea 100644 --- a/roles/nfd_install/defaults/main.yml +++ b/roles/nfd_install/defaults/main.yml @@ -15,8 +15,11 @@ ## --- nfd_image: "registry.k8s.io/nfd/node-feature-discovery" -nfd_image_tag: "v0.13.1-minimal" +nfd_version: "v0.14.3" +nfd_image_tag: "{{ nfd_version }}-minimal" +nfd_helm_name: "nfd" +nfd_helm_repo: "https://kubernetes-sigs.github.io/node-feature-discovery/charts" nfd_namespace: "kube-system" nfd_sa_create: true diff --git a/roles/nfd_install/tasks/main.yml b/roles/nfd_install/tasks/main.yml index 93d497d2..931fa638 100644 --- a/roles/nfd_install/tasks/main.yml +++ b/roles/nfd_install/tasks/main.yml @@ -23,18 +23,10 @@ changed_when: false when: inventory_hostname == groups['kube_control_plane'][0] -- name: copy NFD Helm chart to the controller node - copy: - src: "{{ (role_path, 'charts', 'node-feature-discovery') | path_join }}" - dest: "{{ (project_root_dir, 'charts') | path_join }}" - mode: 0755 - when: inventory_hostname == groups['kube_control_plane'][0] - -- name: add correct appVersion to Chart.yml - replace: - path: "{{ (project_root_dir, 'charts', 'node-feature-discovery', 'Chart') | path_join }}.yaml" - regexp: '^appVersion: (.*)$' - replace: 'appVersion: "{{ nfd_image_tag }}"' +- name: add NFD helm repo + kubernetes.core.helm_repository: + name: "{{ nfd_helm_name }}" + repo_url: "{{ nfd_helm_repo }}" when: inventory_hostname == groups['kube_control_plane'][0] - name: populate NFD Helm chart values template and push to controller node @@ -59,11 +51,12 @@ - inventory_hostname == groups['kube_control_plane'][0] - on_cloud is defined and on_cloud -- name: Deploy NFD chart using values files on target +- name: Deploy NFD kubernetes.core.helm: name: node-feature-discovery release_state: present - chart_ref: "{{ (project_root_dir, 'charts', 'node-feature-discovery') | path_join }}" + chart_ref: "nfd/node-feature-discovery" + chart_version: "{{ nfd_version }}" release_namespace: "{{ nfd_namespace }}" values_files: "{{ (project_root_dir, 'charts', 'nfd-values.yml') | path_join }}" wait: yes diff --git a/roles/nfd_install/templates/helm_values.yml.j2 b/roles/nfd_install/templates/helm_values.yml.j2 index 7896ad74..3b212ba8 100644 --- a/roles/nfd_install/templates/helm_values.yml.j2 +++ b/roles/nfd_install/templates/helm_values.yml.j2 @@ -9,7 +9,9 @@ imagePullSecrets: [] nameOverride: "" fullnameOverride: "" +namespaceOverride: "" +enableNodeFeatureApi: true master: instance: @@ -20,23 +22,15 @@ master: {% endif %} {% if gpu_dp_enabled | default(false) %} - "gpu.intel.com" -{% endif %} - resourceLabels: -{% if sgx_dp_enabled | default(false) %} - - "sgx.intel.com/epc" 
-{% endif %} -{% if gpu_dp_enabled | default(false) %} - - "gpu.intel.com/memory.max" - - "gpu.intel.com/millicores" {% endif %} {% else %} extraLabelNs: [] - resourceLabels: [] {% endif %} featureApi: crdController: null featureRulesController: null + nfdApiParallelism: null deploymentAnnotations: {} replicaCount: 1 @@ -82,10 +76,6 @@ master: nodeSelector: {} tolerations: - - key: "node-role.kubernetes.io/master" - operator: "Equal" - value: "" - effect: "NoSchedule" - key: "node-role.kubernetes.io/control-plane" operator: "Equal" value: "" @@ -96,12 +86,6 @@ master: affinity: nodeAffinity: preferredDuringSchedulingIgnoredDuringExecution: - - weight: 1 - preference: - matchExpressions: - - key: "node-role.kubernetes.io/master" - operator: In - values: [""] - weight: 1 preference: matchExpressions: @@ -113,6 +97,10 @@ worker: config: core: sleepInterval: {{ nfd_sleep_interval |d("60s") }} + labelSources: [all] + sources: + local: + hooksEnabled: true daemonsetAnnotations: {} podSecurityContext: {} diff --git a/roles/nfd_install/templates/node-feature-rules.yml.j2 b/roles/nfd_install/templates/node-feature-rules.yml.j2 index e847b36c..6d11d8cf 100644 --- a/roles/nfd_install/templates/node-feature-rules.yml.j2 +++ b/roles/nfd_install/templates/node-feature-rules.yml.j2 @@ -12,7 +12,7 @@ spec: - feature: pci.device matchExpressions: vendor: {op: In, value: ["8086"]} - device: {op: In, value: ["2710"]} + device: {op: In, value: ["2710", "2714"]} class: {op: In, value: ["0b40"]} - feature: kernel.loadedmodule matchExpressions: @@ -34,13 +34,25 @@ spec: {% endif %} {% if gpu_dp_enabled | d(false) %} - name: "intel.gpu" +{% if gas_enabled | d(false) %} + extendedResources: + gpu.intel.com/millicores: "@local.label.gpu.intel.com/millicores" + gpu.intel.com/memory.max: "@local.label.gpu.intel.com/memory.max" + gpu.intel.com/tiles: "@local.label.gpu.intel.com/tiles" + matchFeatures: + - feature: local.label + matchExpressions: + gpu.intel.com/millicores: {op: Exists} + gpu.intel.com/memory.max: {op: Exists} + gpu.intel.com/tiles: {op: Exists} +{% endif %} labels: "intel.feature.node.kubernetes.io/gpu": "true" matchFeatures: - feature: pci.device matchExpressions: vendor: {op: In, value: ["8086"]} - class: {op: In, value: ["0380"]} + class: {op: In, value: ["0300", "0380"]} - feature: kernel.loadedmodule matchExpressions: drm: {op: Exists} @@ -63,6 +75,8 @@ spec: {% endif %} {% if sgx_dp_enabled | d(false) %} - name: "intel.sgx" + extendedResources: + sgx.intel.com/epc: "@cpu.security.sgx.epc" labels: "intel.feature.node.kubernetes.io/sgx": "true" matchFeatures: @@ -70,10 +84,10 @@ spec: matchExpressions: SGX: {op: Exists} SGXLC: {op: Exists} - - feature: cpu.sgx + - feature: cpu.security matchExpressions: - enabled: {op: IsTrue} -{% if not (ansible_distribution == "Ubuntu" and ansible_distribution_version == "20.04") %} + sgx.enabled: {op: IsTrue} +{% if not (ansible_distribution == "Ubuntu" and ansible_distribution_version is version('20.04', '==')) %} - feature: kernel.config matchExpressions: X86_SGX: {op: Exists} diff --git a/roles/openssl_engine_install/defaults/main.yml b/roles/openssl_engine_install/defaults/main.yml index bc062e84..e80693ff 100644 --- a/roles/openssl_engine_install/defaults/main.yml +++ b/roles/openssl_engine_install/defaults/main.yml @@ -16,9 +16,9 @@ --- openssl_engine_dir: "{{ (project_root_dir, 'openssl') | path_join }}" openssl_engine_url: "https://github.com/intel/QAT_Engine.git" -openssl_engine_version: "v1.3.1" +openssl_engine_version: "v1.4.0" libarchive_url: 
"https://github.com/libarchive/libarchive/releases/download/v3.5.1/libarchive-3.5.1.tar.xz" ipp_crypto_url: "https://github.com/intel/ipp-crypto.git" -ipp_crypto_version: "ippcp_2021.7.1" +ipp_crypto_version: "ippcp_2021.8" intel_ipsec_url: "https://github.com/intel/intel-ipsec-mb.git" intel_ipsec_version: "v1.4" diff --git a/roles/openssl_engine_install/tasks/main.yml b/roles/openssl_engine_install/tasks/main.yml index d1ea0bdb..ed9c18ea 100644 --- a/roles/openssl_engine_install/tasks/main.yml +++ b/roles/openssl_engine_install/tasks/main.yml @@ -19,17 +19,17 @@ path: "{{ openssl_engine_dir }}" register: engine_stat_result when: - - inventory_hostname == groups['kube_node'][0] + - inventory_hostname in groups['kube_node'] - name: enable OpenSSL*Engine include_tasks: openssl_engine_config.yml when: - - inventory_hostname == groups['kube_node'][0] + - inventory_hostname in groups['kube_node'] - engine_stat_result.stat.exists - name: OpenSSL*Engine enablement skipped debug: msg: "OpenSSL*Engine enablement skipped, need to set openssl_install to true in host vars" when: - - inventory_hostname == groups['kube_node'][0] + - inventory_hostname in groups['kube_node'] - not engine_stat_result.stat.exists diff --git a/roles/openssl_engine_install/tasks/openssl_engine_config.yml b/roles/openssl_engine_install/tasks/openssl_engine_config.yml index 6d933724..85caf834 100644 --- a/roles/openssl_engine_install/tasks/openssl_engine_config.yml +++ b/roles/openssl_engine_install/tasks/openssl_engine_config.yml @@ -24,7 +24,7 @@ path: "{{ openssl_engine_dir }}/libarchive_install" state: directory mode: "u=rwx,g=rx,o=rx" - when: ansible_os_family == "RedHat" and ansible_distribution_version >= '8.3' + when: ansible_os_family == "RedHat" and ansible_distribution_version is version('8.3', '>=') - name: download and unarchive libarchive required version for Rocky / RHEL >= 8.3 unarchive: @@ -35,14 +35,14 @@ register: libarchive_rhel_version until: libarchive_rhel_version is not failed retries: 5 - when: ansible_os_family == "RedHat" and ansible_distribution_version >= '8.3' + when: ansible_os_family == "RedHat" and ansible_distribution_version is version('8.3', '>=') - name: configure libarchive with PATH in Rocky / RHEL >= 8.3 command: './configure --prefix=/usr --disable-static' args: chdir: "{{ openssl_engine_dir }}/libarchive_install/libarchive-3.5.1" changed_when: true - when: ansible_os_family == "RedHat" and ansible_distribution_version >= '8.3' + when: ansible_os_family == "RedHat" and ansible_distribution_version is version('8.3', '>=') - name: detect number of cores on system command: nproc @@ -55,7 +55,7 @@ target: install environment: "MAKEFLAGS": "-j{{ nproc_out.stdout | int }}" - when: ansible_os_family == "RedHat" and ansible_distribution_version >= '8.3' + when: ansible_os_family == "RedHat" and ansible_distribution_version is version('8.3', '>=') # Building Intel® Integrated Performance Primitives Cryptography - name: create directory for Intel IPPC dependencies @@ -114,7 +114,7 @@ when: - configure_tdx | default(false) - not on_vms | default(false) - - ansible_distribution == "Ubuntu" and ansible_distribution_version == '22.04' + - ansible_distribution == "Ubuntu" and ansible_distribution_version is version('22.04', '==') - name: build Intel MBC-IPsec Library command: 'make -j{{ nproc_out.stdout | int }} SAFE_DATA=y SAFE_PARAM=y SAFE_LOOKUP=y' diff --git a/roles/opentelemetry_install/defaults/main.yml b/roles/opentelemetry_install/defaults/main.yml index 3d50ab2b..0c984b6e 100644 --- 
a/roles/opentelemetry_install/defaults/main.yml +++ b/roles/opentelemetry_install/defaults/main.yml @@ -17,10 +17,10 @@ opentelemetry_repo: "https://open-telemetry.github.io/opentelemetry-helm-charts" opentelemetry_operator_namespace: "monitoring" opentelemetry_operator_chart_name: "opentelemetry-operator" -opentelemetry_operator_chart_version: "0.35.1" +opentelemetry_operator_chart_version: "0.43.0" opentelemetry_collectors: gateway: true cadvisor: "{{ cadvisor_enabled | default(false) }}" telegraf: "{{ telegraf_enabled | default(false) }}" - elasticsearch: "{{ elasticsearch_enabled | default(false) }}" + elasticsearch: "{{ eck_enabled | default(false) }}" diff --git a/roles/opentelemetry_install/tasks/cleanup.yml b/roles/opentelemetry_install/tasks/cleanup.yml index e6ddfa77..faca1264 100644 --- a/roles/opentelemetry_install/tasks/cleanup.yml +++ b/roles/opentelemetry_install/tasks/cleanup.yml @@ -31,7 +31,7 @@ - name: get opentelemetry secrets ansible.builtin.shell: - cmd: kubectl get secrets -n monitoring | grep -E 'otel|opentelemetry' | awk '{ print $1 }' + cmd: set -o pipefail && kubectl get secrets -n monitoring | grep -E 'otel|opentelemetry' | awk '{ print $1 }' args: executable: /bin/bash register: otel_secrets diff --git a/roles/opentelemetry_install/tasks/main.yml b/roles/opentelemetry_install/tasks/main.yml index d180b2bc..6f6b8eaf 100644 --- a/roles/opentelemetry_install/tasks/main.yml +++ b/roles/opentelemetry_install/tasks/main.yml @@ -67,8 +67,8 @@ - name: get Elasticsearch credentials ansible.builtin.shell: >- - kubectl get secrets --namespace=monitoring - elasticsearch-master-credentials -ojsonpath='{.data.password}' | base64 -d + set -o pipefail && kubectl get secrets --namespace=monitoring + elasticsearch-main-es-elastic-user -ojsonpath='{.data.elastic}' | base64 -d args: executable: /bin/bash changed_when: false diff --git a/roles/opentelemetry_install/tasks/preflight.yml b/roles/opentelemetry_install/tasks/preflight.yml new file mode 100644 index 00000000..e9bb0779 --- /dev/null +++ b/roles/opentelemetry_install/tasks/preflight.yml @@ -0,0 +1,24 @@ +## +## Copyright (c) 2020-2023 Intel Corporation. +## +## Licensed under the Apache License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. +## You may obtain a copy of the License at +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +## See the License for the specific language governing permissions and +## limitations under the License. +## +- name: Check dependencies for opentelemetry + ansible.builtin.assert: + that: + - jaeger_enabled | default(false) + - prometheus_stack_enabled | default(false) + - eck_enabled | default(false) + fail_msg: + "When OpenTelemetry is enabled, the following components must also be enabled: jaeger, prometheus_stack, eck (elasticsearch)."
+ when: opentelemetry_enabled | default(false) diff --git a/roles/opentelemetry_install/templates/otel-gateway.yaml.j2 b/roles/opentelemetry_install/templates/otel-gateway.yaml.j2 index 4360b888..8ef18272 100644 --- a/roles/opentelemetry_install/templates/otel-gateway.yaml.j2 +++ b/roles/opentelemetry_install/templates/otel-gateway.yaml.j2 @@ -43,7 +43,7 @@ spec: loglevel: info elasticsearch/log: - endpoints: [https://elasticsearch-master.monitoring.svc:9200] + endpoints: [https://elasticsearch-main-es-http.monitoring.svc:9200] tls: cert_file: /var/run/secrets/otel-elasticsearch-tls/tls.crt key_file: /var/run/secrets/otel-elasticsearch-tls/tls.key @@ -52,7 +52,7 @@ spec: user: elastic password: {{ es_otel_credentials.stdout }} - jaeger: + otlp: endpoint: jaeger-collector-headless.monitoring.svc:14250 tls: insecure: true @@ -71,7 +71,7 @@ spec: traces: receivers: [otlp] processors: [batch] - exporters: [logging,jaeger] + exporters: [logging,otlp] metrics: receivers: [otlp] processors: [batch] diff --git a/roles/operator_framework/defaults/main.yml b/roles/operator_framework/defaults/main.yml index 18382b68..a837d3e9 100644 --- a/roles/operator_framework/defaults/main.yml +++ b/roles/operator_framework/defaults/main.yml @@ -14,7 +14,7 @@ ## limitations under the License. ## operator_sdk_git: "https://github.com/operator-framework/operator-sdk.git" -operator_sdk_git_ref: "v1.26.0" +operator_sdk_git_ref: "v1.32.0" operator_sdk_dir: "{{ (project_root_dir, 'operator-sdk') | path_join }}" -operator_lm_version: "v0.22.0" +operator_lm_version: "v0.26.0" diff --git a/roles/operator_framework/tasks/main.yml b/roles/operator_framework/tasks/main.yml index ccbe9eb9..e0e87cfc 100644 --- a/roles/operator_framework/tasks/main.yml +++ b/roles/operator_framework/tasks/main.yml @@ -40,7 +40,7 @@ chdir: "{{ operator_sdk_dir }}" - name: install Operator Lifecycle Manager (OLM) - command: "operator-sdk olm install --version {{ operator_lm_version }}" + command: "operator-sdk olm install --version {{ operator_lm_version }} --timeout 4m" environment: PATH: "{{ gopath.stdout }}/bin:/usr/local/go/bin:/usr/sbin:/usr/bin:/sbin:/bin:{{ operator_sdk_dir }}" changed_when: true @@ -50,7 +50,7 @@ args: executable: /bin/bash register: olm_pods_status - retries: 30 + retries: 3 delay: 10 until: - "'Error' not in olm_pods_status.stdout" diff --git a/roles/platform_aware_scheduling_install/charts/gpu-aware-scheduling/templates/gas-deployment.yaml b/roles/platform_aware_scheduling_install/charts/gpu-aware-scheduling/templates/gas-deployment.yaml index d43212a0..e0340667 100644 --- a/roles/platform_aware_scheduling_install/charts/gpu-aware-scheduling/templates/gas-deployment.yaml +++ b/roles/platform_aware_scheduling_install/charts/gpu-aware-scheduling/templates/gas-deployment.yaml @@ -49,17 +49,12 @@ spec: secretName: {{ .Values.tls.secretName }} {{- end}} tolerations: - - key: node-role.kubernetes.io/master - operator: Exists - key: node-role.kubernetes.io/control-plane operator: Exists affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - - matchExpressions: - - key: node-role.kubernetes.io/master - operator: Exists - matchExpressions: - key: node-role.kubernetes.io/control-plane operator: Exists diff --git a/roles/prometheus_install/defaults/main.yml b/roles/prometheus_install/defaults/main.yml index 309f19e8..dad4b4c2 100644 --- a/roles/prometheus_install/defaults/main.yml +++ b/roles/prometheus_install/defaults/main.yml @@ -13,6 +13,6 @@ ## See the License for the specific language 
governing permissions and ## limitations under the License. ## -prometheus_stack_version: 2.47.0 -grafana_version: 10.1.2 -node_exporter_version: 1.6.1 +prometheus_stack_version: 2.48.0 +grafana_version: 10.2.2 +node_exporter_version: 1.7.0 diff --git a/roles/prometheus_install/kube_prometheus/defaults/main.yml b/roles/prometheus_install/kube_prometheus/defaults/main.yml index c20c5461..f7a094ba 100644 --- a/roles/prometheus_install/kube_prometheus/defaults/main.yml +++ b/roles/prometheus_install/kube_prometheus/defaults/main.yml @@ -16,15 +16,14 @@ kube_prometheus_stack_directory: "{{ (project_root_dir, 'kube-prometheus-stack') | path_join }}" kube_prometheus_stack_namespace: monitoring -prometheus_operator_version: 0.68.0 -kube_state_metrics_version: 2.10.0 +prometheus_operator_version: 0.69.1 +kube_state_metrics_version: 2.10.1 tas_demo_policy_dir: "{{ (project_root_dir, 'tas-demo-policy') | path_join }}" # expose prometheus server API prometheus_srv_expose: false prometheus_srv_proxy_port: 9443 -prometheus_srv_node_port: 30443 prometheus_srv_address: 127.0.0.1 prometheus_srv_nginx_image: "docker.io/library/nginx:1.24.0-alpine" prometheus_srv_nginx_ssl_ciphers: @@ -33,7 +32,6 @@ prometheus_srv_nginx_ssl_ciphers: prometheus_srv_nginx_ssl_protocols: "TLSv1.2 TLSv1.3" prometheus_srv_location_exposed: "/prometheus/" prometheus_srv_user: prometheus -prometheus_srv_password: prometheus prometheus_stack_version_files: - "{{ kube_prometheus_stack_directory }}/kube-prometheus-storage-class.yml" diff --git a/roles/prometheus_install/kube_prometheus/tasks/main.yml b/roles/prometheus_install/kube_prometheus/tasks/main.yml index 327c6801..c37922f6 100644 --- a/roles/prometheus_install/kube_prometheus/tasks/main.yml +++ b/roles/prometheus_install/kube_prometheus/tasks/main.yml @@ -90,9 +90,14 @@ executable: /bin/bash changed_when: true + - name: create password for prometheus + ansible.builtin.set_fact: + prometheus_srv_password: "{{ lookup('ansible.builtin.password', '/dev/null', seed=inventory_hostname) }}" + - name: generate htpasswd file ansible.builtin.command: htpasswd -Bbn {{ prometheus_srv_user }} {{ prometheus_srv_password }} register: htpasswd + changed_when: true when: - prometheus_srv_expose | default(false) diff --git a/roles/prometheus_install/kube_prometheus/templates/grafana-service.yaml.j2 b/roles/prometheus_install/kube_prometheus/templates/grafana-service.yaml.j2 index 2524d7bf..c5eabbb4 100644 --- a/roles/prometheus_install/kube_prometheus/templates/grafana-service.yaml.j2 +++ b/roles/prometheus_install/kube_prometheus/templates/grafana-service.yaml.j2 @@ -9,11 +9,9 @@ metadata: name: grafana namespace: monitoring spec: - type: NodePort ports: - name: grafana-https port: 3000 - nodePort: 30000 targetPort: grafana-https selector: app.kubernetes.io/component: grafana diff --git a/roles/prometheus_install/kube_prometheus/templates/kube-prometheus-stack-certs.yml.j2 b/roles/prometheus_install/kube_prometheus/templates/kube-prometheus-stack-certs.yml.j2 index bc1a13bc..85fded59 100644 --- a/roles/prometheus_install/kube_prometheus/templates/kube-prometheus-stack-certs.yml.j2 +++ b/roles/prometheus_install/kube_prometheus/templates/kube-prometheus-stack-certs.yml.j2 @@ -46,6 +46,9 @@ spec: {% else %} - {{ hostvars[host]['ansible_default_ipv4']['address'] }} {% endif %} +{% if calico_vpp.enabled | default(false) %} + - {{ hostvars[host]['ansible_' + calico_vpp.interface_name]['ipv4']['address'] }} +{% endif %} {% endfor %} - 127.0.0.1 isCA: false @@ -75,6 +78,9 @@ spec: {% else %} - {{ 
hostvars[host]['ansible_default_ipv4']['address'] }} {% endif %} +{% if calico_vpp.enabled | default(false) %} + - {{ hostvars[host]['ansible_' + calico_vpp.interface_name]['ipv4']['address'] }} +{% endif %} {% endfor %} - 127.0.0.1 isCA: false @@ -105,6 +111,9 @@ spec: {% else %} - {{ hostvars[host]['ansible_default_ipv4']['address'] }} {% endif %} +{% if calico_vpp.enabled | default(false) %} + - {{ hostvars[host]['ansible_' + calico_vpp.interface_name]['ipv4']['address'] }} +{% endif %} {% endfor %} - 127.0.0.1 isCA: false diff --git a/roles/prometheus_install/kube_prometheus/templates/prometheus_srv/prometheus-srv-service.yaml.j2 b/roles/prometheus_install/kube_prometheus/templates/prometheus_srv/prometheus-srv-service.yaml.j2 index 65523910..beb313cb 100644 --- a/roles/prometheus_install/kube_prometheus/templates/prometheus_srv/prometheus-srv-service.yaml.j2 +++ b/roles/prometheus_install/kube_prometheus/templates/prometheus_srv/prometheus-srv-service.yaml.j2 @@ -7,13 +7,11 @@ metadata: labels: prometheus: k8s spec: - type: NodePort ports: - port: {{ prometheus_srv_proxy_port }} protocol: TCP name: nginx-https targetPort: nginx-https - nodePort: {{ prometheus_srv_node_port }} selector: prometheus: k8s app.kubernetes.io/component: prometheus diff --git a/roles/redeploy_cleanup/tasks/intel_cleanup.yml b/roles/redeploy_cleanup/tasks/intel_cleanup.yml index df8558d7..04838de8 100644 --- a/roles/redeploy_cleanup/tasks/intel_cleanup.yml +++ b/roles/redeploy_cleanup/tasks/intel_cleanup.yml @@ -38,7 +38,8 @@ block: - name: find ddp-i40e service shell: - cmd: systemctl list-unit-files | grep ddp-i40e | cut -d" " -f1 # noqa command-instead-of-module - systemctl is used intentionally here + cmd: set -o pipefail && systemctl list-unit-files | grep ddp-i40e | cut -d" " -f1 # noqa command-instead-of-module - systemctl is used intentionally + executable: /bin/bash register: ddp_i40e_service changed_when: false failed_when: false @@ -194,7 +195,7 @@ failed_when: false when: - ansible_os_family == "RedHat" - - ansible_distribution_version >= "9.0" + - ansible_distribution_version is version('9.0', '>=') - configure_sgx | default(false) - name: resume disks @@ -245,7 +246,7 @@ when: - configure_tdx | default(false) | bool - ansible_distribution == "Ubuntu" - - ansible_distribution_version == "22.04" + - ansible_distribution_version is version('22.04', '==') - debug: msg: "Intel Container Experience Kit features have been removed ..." 
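A note on the recurring comparison changes in the hunks above and below: plain string operators on ansible_distribution_version compare lexicographically, so '9.0' >= '10.0' evaluates to true and such checks silently misfire once a distribution reaches a two-digit major release. The version() test parses the numeric components instead, which is why every ">= '8.3'"-style condition is being rewritten. A minimal illustrative task (not part of the patch) showing the difference:

- name: Contrast string comparison with the version() test
  ansible.builtin.debug:
    msg:
      # Lexicographic: '9' sorts after '1', so the string comparison is (wrongly) true.
      - "as strings: {{ '9.0' >= '10.0' }}"
      # version() compares release components numerically, so this is false.
      - "with version(): {{ '9.0' is version('10.0', '>=') }}"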
diff --git a/roles/redeploy_cleanup/tasks/main_k8s.yml b/roles/redeploy_cleanup/tasks/main_k8s.yml index 40392a7c..27056352 100644 --- a/roles/redeploy_cleanup/tasks/main_k8s.yml +++ b/roles/redeploy_cleanup/tasks/main_k8s.yml @@ -23,12 +23,12 @@ when: kubernetes | default(false) | bool -- name: uninstall elasticsearch +- name: uninstall eck ansible.builtin.include_role: - name: elasticsearch_install + name: eck_install tasks_from: cleanup tags: - - elasticsearch + - eck - name: uninstall opentelemetry ansible.builtin.include_role: @@ -143,6 +143,19 @@ tags: - intel-oneapi +- name: Cleanup KubeVirt + ansible.builtin.include_role: + name: kubevirt_install + tasks_from: cleanup + tags: + - kubevirt + +- name: Cleanup Ingress Nginx + ansible.builtin.include_role: + name: ingress_nginx_install + tasks_from: cleanup + tags: ingress-nginx + - name: reset and remove Kubernetes cluster ansible.builtin.import_tasks: k8s_cleanup.yml when: kube_provisioner == "kubespray" diff --git a/roles/redeploy_cleanup/tasks/rke2_cleanup.yml b/roles/redeploy_cleanup/tasks/rke2_cleanup.yml index 611db55e..de6be4f5 100644 --- a/roles/redeploy_cleanup/tasks/rke2_cleanup.yml +++ b/roles/redeploy_cleanup/tasks/rke2_cleanup.yml @@ -33,6 +33,7 @@ - rke2_uninstall_sh.stat.exists - inventory_hostname == groups['kube_control_plane'][0] register: result + changed_when: true failed_when: "'error' in result.stderr" - name: remove rke2 cluster files diff --git a/roles/rke2_defaults/tasks/rke2_preflight.yml b/roles/rke2_defaults/tasks/rke2_preflight.yml index 73dfc0fd..36d26bb8 100644 --- a/roles/rke2_defaults/tasks/rke2_preflight.yml +++ b/roles/rke2_defaults/tasks/rke2_preflight.yml @@ -17,7 +17,7 @@ - name: check linux distro version and kernel for RKE2 ansible.builtin.assert: that: > - - (ansible_distribution == 'Ubuntu' and ansible_distribution_version == '22.04') + - (ansible_distribution == 'Ubuntu' and ansible_distribution_version is version('22.04', '==')) msg: - "RKE2 is supported only on Ubuntu 22.04 with RA" diff --git a/roles/rke2_kubernetes_apps/cert_manager_install/defaults/main.yml b/roles/rke2_kubernetes_apps/cert_manager_install/defaults/main.yml index d64cbfad..73cb5416 100644 --- a/roles/rke2_kubernetes_apps/cert_manager_install/defaults/main.yml +++ b/roles/rke2_kubernetes_apps/cert_manager_install/defaults/main.yml @@ -14,5 +14,5 @@ ## limitations under the License. 
## --- -cert_manager_version: "v1.11.1" +cert_manager_version: "v1.12.5" cert_manager_crd_url: "https://github.com/cert-manager/cert-manager/releases/download/{{ cert_manager_version }}/cert-manager.crds.yaml" diff --git a/roles/rke2_kubernetes_apps/helm/defaults/main.yml b/roles/rke2_kubernetes_apps/helm/defaults/main.yml index 3e846b63..82ada9d0 100644 --- a/roles/rke2_kubernetes_apps/helm/defaults/main.yml +++ b/roles/rke2_kubernetes_apps/helm/defaults/main.yml @@ -26,6 +26,6 @@ host_architecture: >- {%- endif -%} image_arch: "{{host_architecture | default('amd64')}}" -helm_version: "v3.12.3" +helm_version: "v3.13.1" helm_download_url: "https://get.helm.sh/helm-{{ helm_version }}-linux-{{ image_arch }}.tar.gz" helm_dest: "{{ rke2_root_dir }}/helm-{{ helm_version }}" diff --git a/roles/rke2_kubernetes_apps/rancher/defaults/main.yml b/roles/rke2_kubernetes_apps/rancher/defaults/main.yml index 213cdc16..5fd4e07a 100644 --- a/roles/rke2_kubernetes_apps/rancher/defaults/main.yml +++ b/roles/rke2_kubernetes_apps/rancher/defaults/main.yml @@ -15,4 +15,4 @@ ## --- rancher_namespace: cattle-system -rancher_version: '2.7.3' +rancher_version: '2.7.9' diff --git a/roles/rook_install/defaults/main.yml b/roles/rook_install/defaults/main.yml index 2b9b6c0d..66736175 100755 --- a/roles/rook_install/defaults/main.yml +++ b/roles/rook_install/defaults/main.yml @@ -18,6 +18,8 @@ rook_namespace: "rook-ceph" # CPU control plane namespace rook_release_name: "rook-ceph" # CPU control plane release name rook_git_url: "https://github.com/rook/rook.git" -rook_git_tag: "v1.10.10" +rook_git_tag: "v1.12.9" rook_git_local_dir: "{{ (project_root_dir, 'rook') | path_join }}" rook_helm_local_dir: "{{ (project_root_dir, 'charts', 'rook') | path_join }}" + +ceph_version: v17.2.6 diff --git a/roles/rook_install/tasks/install_rook_helmchart.yml b/roles/rook_install/tasks/install_rook_helmchart.yml index a1dbac03..88735156 100755 --- a/roles/rook_install/tasks/install_rook_helmchart.yml +++ b/roles/rook_install/tasks/install_rook_helmchart.yml @@ -42,6 +42,7 @@ - "csi/nfs/rbac.yaml" - "filesystem-test.yaml" - "object-test.yaml" + - "nfs-test.yaml" - name: populate Rook Operator templates and copy to controller node vars: @@ -98,6 +99,25 @@ when: - rook_ceph.storage_type == "cephfs" +- name: block to deploy rook cephfs storage + block: + - name: populate cephfs storage class templates and copy to controller node + ansible.builtin.template: + src: "nfs-storageclass.yaml.j2" + dest: "{{ (rook_helm_local_dir, 'temp', 'nfs-storageclass.yaml') | path_join }}" + force: true + trim_blocks: false + mode: preserve + - name: k8s deploy rook-ceph cephfs filesystem and storageclass + kubernetes.core.k8s: + state: present + src: "{{ (rook_helm_local_dir, 'temp', item ) | path_join }}" + loop: + - "nfs-test.yaml" + - "nfs-storageclass.yaml" + when: + - rook_ceph.storage_type == "nfs" + - name: block to deploy rook-ceph block storage block: - name: populate block storage class templates and copy to controller node @@ -133,6 +153,24 @@ when: - rook_ceph.storage_type == "object" +- name: block to deploy persistent volume claim + block: + - name: populate persistent volume claim + ansible.builtin.template: + src: "pvc.yaml.j2" + dest: "{{ (rook_helm_local_dir, 'temp', 'pvc.yaml') | path_join }}" + force: true + trim_blocks: false + mode: preserve + - name: k8s deploy persistent volume claim + kubernetes.core.k8s: + state: present + src: "{{ (rook_helm_local_dir, 'temp', 'pvc.yaml' ) | path_join }}" + when: + - rook_ceph.storage_type == 
"object" or + rook_ceph.storage_type == "nfs" or + rook_ceph.storage_type == "cephfs" + # debug tools, set the ignore_errors to true - name: install rook-ceph kubectl plugin for status checking ansible.builtin.command: >- diff --git a/roles/rook_install/templates/block-storageclass.yaml.j2 b/roles/rook_install/templates/block-storageclass.yaml.j2 index 3e32f606..6aca3c90 100644 --- a/roles/rook_install/templates/block-storageclass.yaml.j2 +++ b/roles/rook_install/templates/block-storageclass.yaml.j2 @@ -54,4 +54,4 @@ parameters: # uncomment the following to use rbd-nbd as mounter on supported nodes # mounter: rbd-nbd allowVolumeExpansion: true -reclaimPolicy: delete +reclaimPolicy: Delete diff --git a/roles/rook_install/templates/cluster.yaml.j2 b/roles/rook_install/templates/cluster.yaml.j2 index daaf238a..526bd030 100644 --- a/roles/rook_install/templates/cluster.yaml.j2 +++ b/roles/rook_install/templates/cluster.yaml.j2 @@ -21,7 +21,7 @@ spec: # versions running within the cluster. See tags available at https://hub.docker.com/r/ceph/ceph/tags/. # If you want to be more precise, you can always use a timestamp tag such quay.io/ceph/ceph:v17.2.3-20220805 # This tag might not contain a new Ceph version, just security fixes from the underlying operating system, which will reduce vulnerabilities - image: quay.io/ceph/ceph:v17.2.5 + image: quay.io/ceph/ceph:{{ ceph_version }} # Whether to allow unsupported versions of Ceph. Currently `pacific` and `quincy` are supported. # Future versions such as `reef` (v18) would require this to be set to `true`. # Do not set to true in production. diff --git a/roles/rook_install/templates/nfs-storageclass.yaml.j2 b/roles/rook_install/templates/nfs-storageclass.yaml.j2 new file mode 100644 index 00000000..11484144 --- /dev/null +++ b/roles/rook_install/templates/nfs-storageclass.yaml.j2 @@ -0,0 +1,48 @@ +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: {{ rook_ceph.storage_class }} +# Change "rook-ceph" provisioner prefix to match the operator namespace if needed +provisioner: rook-ceph.nfs.csi.ceph.com +parameters: + # nfsCluster is the name of the NFS cluster as managed by Ceph (sometimes called the NFS cluster ID). + # With Rook, this should get the name of the CephNFS resource. + nfsCluster: my-nfs + + # server is the host name, ip address, or Kubernetes Service that points to the Ceph NFS server + # used for mounting the NFS-export. + # With Rook, a Kubernetes Service named with the pattern "rook-ceph--a" will + # always be created and can be used here. This is where name-of-cephnfs refers to the name of the + # CephNFS resource used for nfsCluster above. + server: rook-ceph-nfs-my-nfs-a + + # clusterID is the Kubernetes namespace where the CephCluster is running + # If you change this namespace, also change the namespace below where the secret namespaces are defined + clusterID: rook-ceph # namespace:cluster + + # CephFS filesystem name into which the volume shall be created + # With Rook, this will be the name of the CephFilesystem resource used to back NFS exports. + fsName: myfs + + # Ceph pool into which the volume shall be created + # Required for provisionVolume: "true" + # With Rook, this will be "-" where filesystem-name is the name of the + # CephFilesystem used in fsName and where pool-name refers to the name of a data pool defined for + # the CephFilesystem used for fsName above. + pool: myfs-replicated + + # The secrets contain Ceph admin credentials. 
These are generated automatically by the Rook + # operator in the same namespace as the cluster. Note that the NFS provisioner shares its secrets + # with the CephFS CSI provisioner. + csi.storage.k8s.io/provisioner-secret-name: rook-csi-cephfs-provisioner + csi.storage.k8s.io/provisioner-secret-namespace: rook-ceph # namespace:cluster + csi.storage.k8s.io/controller-expand-secret-name: rook-csi-cephfs-provisioner + csi.storage.k8s.io/controller-expand-secret-namespace: rook-ceph # namespace:cluster + csi.storage.k8s.io/node-stage-secret-name: rook-csi-cephfs-node + csi.storage.k8s.io/node-stage-secret-namespace: rook-ceph # namespace:cluster + +reclaimPolicy: Delete +allowVolumeExpansion: true +mountOptions: + # uncomment the following line for debugging + #- debug diff --git a/roles/rook_install/templates/pvc.yaml.j2 b/roles/rook_install/templates/pvc.yaml.j2 new file mode 100644 index 00000000..f5c8e086 --- /dev/null +++ b/roles/rook_install/templates/pvc.yaml.j2 @@ -0,0 +1,13 @@ +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: cephfs-pvc + namespace: default +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1Gi + storageClassName: {{ rook_ceph.storage_class }} diff --git a/roles/sgx_dp_install/tasks/main.yml b/roles/sgx_dp_install/tasks/main.yml index 1ebbe15b..5e14f52b 100644 --- a/roles/sgx_dp_install/tasks/main.yml +++ b/roles/sgx_dp_install/tasks/main.yml @@ -21,57 +21,48 @@ - inventory_hostname == groups['kube_node'][0] # docker is used as container runtime: -- name: Prepare containers images +- name: Build and push Intel SGX Device Plugin images - docker when: - inventory_hostname == groups['kube_node'][0] - container_runtime == "docker" + - sgx_dp_build_image_locally block: - name: Build Intel SGX Device Plugin images - make: + community.general.make: target: "{{ item }}" chdir: "{{ intel_dp_dir }}" loop: - intel-sgx-plugin - intel-sgx-initcontainer - when: sgx_dp_build_image_locally - - name: Tag Intel SGX Device Plugin images - command: docker tag intel/{{ item }}:{{ intel_dp_version }} {{ registry_local_address }}/{{ item }}:{{ intel_dp_version }} + - name: Push Intel SGX Device Plugin images + community.docker.docker_image: + name: "intel/{{ item }}:{{ intel_dp_version }}" + repository: "{{ registry_local_address }}/{{ item }}:{{ intel_dp_version }}" + push: true + source: local loop: - intel-sgx-plugin - intel-sgx-initcontainer - when: sgx_dp_build_image_locally - - name: Push Intel SGX Device Plugin images to local registry - command: docker push {{ registry_local_address }}/{{ item }}:{{ intel_dp_version }} - loop: - - intel-sgx-plugin - - intel-sgx-initcontainer - when: sgx_dp_build_image_locally # containerd/cri-o is used as container runtime: -- name: Prepare containers images +- name: Build and push Intel SGX Device Plugin images - podman + containers.podman.podman_image: + name: "{{ registry_local_address }}/{{ item }}" + tag: "{{ intel_dp_version }}" + path: "{{ intel_dp_dir }}" + build: + file: "build/docker/{{ item }}.Dockerfile" + push: true + state: build + loop: + - intel-sgx-plugin + - intel-sgx-initcontainer when: - inventory_hostname == groups['kube_node'][0] - - '"docker" not in container_runtime' - block: - - name: Build and tag Intel SGX Device Plugin images - command: podman build -f build/docker/{{ item.file }} -t {{ registry_local_address }}/{{ item.name }}:{{ intel_dp_version }} - args: - chdir: "{{ intel_dp_dir }}" - changed_when: true - with_items: - - {file: intel-sgx-initcontainer.Dockerfile, name: 
intel-sgx-initcontainer} - - {file: intel-sgx-plugin.Dockerfile, name: intel-sgx-plugin} - when: sgx_dp_build_image_locally - - - name: Push Intel SGX Device Plugin images to local registry - command: podman push {{ registry_local_address }}/{{ item }}:{{ intel_dp_version }} - changed_when: true - loop: - - intel-sgx-initcontainer - - intel-sgx-plugin - when: sgx_dp_build_image_locally + - container_runtime in ['containerd', 'crio'] + - sgx_dp_build_image_locally | default(false) - name: Prepare and deploy Intel SGX Device Plugin when: @@ -107,14 +98,14 @@ sgx_plugin_info.resources | length != 0 and (sgx_plugin_info.resources[0].status.numberReady | int) == (sgx_plugin_info.resources[0].status.desiredNumberScheduled | int) -- name: Build Intel sgx-aesmd demo image Docker engine +- name: Build and push Intel sgx-aesmd demo image - docker when: - sgx_aesmd_demo_enable | default(false) | bool - inventory_hostname == groups['kube_node'][0] - container_runtime == "docker" block: - name: Build Intel sgx-aesmd image - make: + community.general.make: target: sgx-aesmd-demo chdir: "{{ intel_dp_dir }}" retries: 5 @@ -122,29 +113,27 @@ register: build_aesmd until: build_aesmd is not failed - - name: Tag Intel sgx-aesmd image - command: docker tag intel/sgx-aesmd-demo:{{ intel_dp_version }} {{ registry_local_address }}/intel-sgx-aesmd-demo:{{ intel_dp_version }} - changed_when: true - - - name: Push Intel sgx-aesmd image to local registry - command: docker push {{ registry_local_address }}/intel-sgx-aesmd-demo:{{ intel_dp_version }} - changed_when: true - -- name: Build Intel sgx-aesmd demo image non-Docker engine + - name: Push Intel sgx-aesmd image + community.docker.docker_image: + name: "intel/sgx-aesmd-demo:{{ intel_dp_version }}" + repository: "{{ registry_local_address }}/intel-sgx-aesmd-demo:{{ intel_dp_version }}" + push: true + source: local + + +- name: Build and push Intel sgx-aesmd demo image - podman + containers.podman.podman_image: + name: "{{ registry_local_address }}/intel-sgx-aesmd-demo" + tag: "{{ intel_dp_version }}" + path: "{{ intel_dp_dir }}" + build: + file: "demo/sgx-aesmd-demo/Dockerfile" + push: true + state: build when: - sgx_aesmd_demo_enable | default(false) | bool - inventory_hostname == groups['kube_node'][0] - - '"docker" not in container_runtime' - block: - - name: Build and tag Intel sgx-aesmd image - command: podman build -f demo/sgx-aesmd-demo/Dockerfile -t {{ registry_local_address }}/intel-sgx-aesmd-demo:{{ intel_dp_version }} - args: - chdir: "{{ intel_dp_dir }}" - changed_when: true - - - name: Push Intel sgx-aesmd image to local registry - command: podman push {{ registry_local_address }}/intel-sgx-aesmd-demo:{{ intel_dp_version }} - changed_when: true + - container_runtime in ['containerd', 'crio'] - name: Prepare and deploy Intel SGX aesmd demo when: @@ -171,13 +160,12 @@ mode: 0755 - name: Install Intel SGX aesmd Helm chart - command: >- - helm upgrade -i intel-sgx-aesmd - -f {{ (project_root_dir, 'charts', 'intel-sgx-aesmd-demo-values.yml') | path_join }} - --namespace {{ sgx_aesmd_namespace }} - --create-namespace - {{ (project_root_dir, 'charts', 'intel-sgx-aesmd') | path_join }} - changed_when: true + kubernetes.core.helm: + chart_ref: "{{ (project_root_dir, 'charts', 'intel-sgx-aesmd') | path_join }}" + release_name: "intel-sgx-aesmd" + release_namespace: "{{ sgx_aesmd_namespace }}" + values_files: "{{ (project_root_dir, 'charts', 'intel-sgx-aesmd-demo-values.yml') | path_join }}" + create_namespace: true - name: Wait for SGX aesmd DaemonSet 
kubernetes.core.k8s_info: diff --git a/roles/sigstore_policy_controller/defaults/main.yml b/roles/sigstore_policy_controller/defaults/main.yml index 0d672ded..4c398fd5 100644 --- a/roles/sigstore_policy_controller/defaults/main.yml +++ b/roles/sigstore_policy_controller/defaults/main.yml @@ -26,7 +26,7 @@ container_registry_secret: container-registry-secret sigstore_chart_name: sigstore sigstore_chart_repo: https://sigstore.github.io/helm-charts -policy_controller_release: 0.6.5 +policy_controller_release: 0.6.7 sigstore_chart_tag: "policy-controller-{{ policy_controller_release }}" policy_controller_release_name: policy-controller policy_controller_dir: "{{ (project_root_dir, 'policy-controller') | path_join }}" diff --git a/roles/sriov_cni_install/tasks/main.yml b/roles/sriov_cni_install/tasks/main.yml index a3c6d1c0..121c3add 100644 --- a/roles/sriov_cni_install/tasks/main.yml +++ b/roles/sriov_cni_install/tasks/main.yml @@ -21,7 +21,7 @@ # WA till upstream fix reach official version - issue https://github.com/k8snetworkplumbingwg/sriov-cni/issues/241 - name: WA till upstream fix reach official version for issue 241 - use commit id set_fact: - sriov_cni_version: "c1faa0805c92be6e5c629e9caf481f43cfee866c" + sriov_cni_version: "75f2f5e1d06390d680e2f639213a58cf91915734" - name: clone sriov-cni repository git: diff --git a/roles/sriov_dp_install/tasks/main.yml b/roles/sriov_dp_install/tasks/main.yml index 682bb37a..d4dc9aac 100644 --- a/roles/sriov_dp_install/tasks/main.yml +++ b/roles/sriov_dp_install/tasks/main.yml @@ -59,65 +59,42 @@ retries: 5 until: sriov_dp_image_build is success -- name: tag SRIOV Network Device Plugin image - command: docker tag {{ sriov_net_dp_image }}:latest {{ registry_local_address }}/sriov-device-plugin:{{ sriov_net_dp_tag }} +- name: Tag and push SRIOV Network Device Plugin image to local registry + community.docker.docker_image: + name: "{{ sriov_net_dp_image }}:latest" + repository: "{{ registry_local_address }}/sriov-device-plugin:{{ sriov_net_dp_tag }}" + push: true + source: local when: - inventory_hostname == groups['kube_node'][0] - sriov_net_dp_build_image_locally - container_runtime == "docker" -- name: push SRIOV Network Device Plugin image to local registry - command: docker push {{ registry_local_address }}/sriov-device-plugin:{{ sriov_net_dp_tag }} +- name: Set proxy args if defined + ansible.builtin.set_fact: + podman_proxy_args: >- + {{ ('--build-arg=http_proxy=' ~ http_proxy) if http_proxy is defined else '' }} + {{ ('--build-arg=https_proxy=' ~ https_proxy) if https_proxy is defined else '' }} when: - - inventory_hostname == groups['kube_node'][0] - - sriov_net_dp_build_image_locally - - container_runtime == "docker" - -- name: build and tag SRIOV Network Device Plugin image behind proxy - shell: >- - podman build -t {{ registry_local_address }}/{{ item.name }}:{{ sriov_net_dp_tag }} - --build-arg=http_proxy=${http_proxy} - --build-arg=https_proxy=${https_proxy} - -f {{ item.file }} . - args: - chdir: "{{ sriov_net_dp_dir }}" - changed_when: true - with_items: - - {file: images/Dockerfile, name: sriov-device-plugin} - register: sriov_dp_image_build - retries: 5 - until: sriov_dp_image_build is success - when: - - inventory_hostname == groups['kube_node'][0] - - sriov_net_dp_build_image_locally - - '"docker" not in container_runtime' - - http_proxy is defined - - https_proxy is defined + - http_proxy is defined or https_proxy is defined -- name: build and tag SRIOV Network Device Plugin image - command: podman build -t {{ registry_local_address }}/{{ item.name }}:{{ sriov_net_dp_tag }} -f {{ item.file }} . 
- args: - chdir: "{{ sriov_net_dp_dir }}" - changed_when: true - with_items: - - {file: images/Dockerfile, name: sriov-device-plugin} +- name: Build and push SRIOV Network Device Plugin image - podman + containers.podman.podman_image: + name: "{{ registry_local_address }}/sriov-device-plugin" + tag: "{{ sriov_net_dp_tag }}" + path: "{{ sriov_net_dp_dir }}" + build: + file: images/Dockerfile + extra_args: "{{ podman_proxy_args | default('') }}" + push: true + state: build register: sriov_dp_image_build retries: 5 until: sriov_dp_image_build is success when: - inventory_hostname == groups['kube_node'][0] - sriov_net_dp_build_image_locally - - '"docker" not in container_runtime' + - container_runtime in ['containerd', 'crio'] - name: create Helm charts directory if needed file: @@ -152,9 +129,11 @@ - inventory_hostname == groups['kube_control_plane'][0] - name: install SRIOV Network Device Plugin helm chart - command: >- - helm upgrade -i sriov-net-dp - -f {{ (project_root_dir, 'charts', 'sriov-net-dp-values.yml') | path_join }} - {{ (project_root_dir, 'charts', 'sriov-net-dp') | path_join }} + kubernetes.core.helm: + chart_ref: "{{ (project_root_dir, 'charts', 'sriov-net-dp') | path_join }}" + release_name: sriov-net-dp + release_namespace: kube-system + values_files: "{{ (project_root_dir, 'charts', 'sriov-net-dp-values.yml') | path_join }}" + force: true when: - inventory_hostname == groups['kube_control_plane'][0] diff --git a/roles/sriov_network_operator_install/templates/values.yml.j2 b/roles/sriov_network_operator_install/templates/values.yml.j2 index 4e29b04a..1b12e700 100644 --- a/roles/sriov_network_operator_install/templates/values.yml.j2 +++ b/roles/sriov_network_operator_install/templates/values.yml.j2 @@ -1,8 +1,5 @@ operator: tolerations: - - key: "node-role.kubernetes.io/master" - operator: "Exists" - effect: "NoSchedule" - key: "node-role.kubernetes.io/control-plane" operator: "Exists" effect: "NoSchedule" @@ -11,10 +8,6 @@ operator: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - - matchExpressions: - - key: "node-role.kubernetes.io/master" - operator: In - values: [ "" ] - matchExpressions: - key: "node-role.kubernetes.io/control-plane" operator: In diff --git a/roles/sriov_nic_init/tasks/bind_vf_driver.yml b/roles/sriov_nic_init/tasks/bind_vf_driver.yml index 621d4994..9c72355b 100644 --- a/roles/sriov_nic_init/tasks/bind_vf_driver.yml +++ b/roles/sriov_nic_init/tasks/bind_vf_driver.yml @@ -57,15 +57,33 @@ # get a list of VFs PCI addresses and save the configuration - name: attach VFs driver block: - - name: fetch VFs pci addresses for a PF - shell: "for vf in /sys/class/net/{{ item.name }}/device/virtfn*;do basename $(readlink -f $vf);done | sort" - register: vf_pciids - args: - executable: /bin/bash - changed_when: false + - name: Fetch VFs device files + ansible.builtin.find: + paths: "/sys/class/net/{{ item.name }}/device/" + file_type: "link" + patterns: "virtfn*" + use_regex: true + recurse: false + register: vf_pciids_path - - name: save VF driver binding - lineinfile: + - name: Fetch VFs device IDs + ansible.builtin.stat: + path: "{{ device_path }}" + 
register: vf_pciids_stats + loop: "{{ vf_pciids_path.files | map(attribute='path') | list | sort }}" + loop_control: + loop_var: device_path + when: + - vf_pciids_path.matched > 0 + + - name: Get VFs pci addresses for a PF + ansible.builtin.set_fact: + vf_pciids: "{{ vf_pciids_stats.results | map(attribute='stat') | map(attribute='lnk_target') | map('basename') | list | sort }}" + when: + - vf_pciids_path.matched > 0 + + - name: Save VF driver binding + ansible.builtin.lineinfile: path: "{{ sriov_config_path }}/cek_interfaces_{{ item.name }}" line: "{{ this_item[0] }} {{ this_item[1].value }}" regexp: "^{{ this_item[0] }}" @@ -74,10 +92,9 @@ group: root mode: '0600' become: yes - loop: "{{ vf_pciids.stdout_lines | zip(vfs_acc | dict2items) | list }}" + loop: "{{ vf_pciids | zip(vfs_acc | dict2items) | list }}" loop_control: loop_var: this_item when: - - vf_pciids.stderr|length == 0 - - vf_pciids.stdout_lines|length > 0 + - vf_pciids_path.matched > 0 when: shell_result.results | sum(attribute='rc') == 0 diff --git a/roles/sriov_nic_init/tasks/create_vfs.yml b/roles/sriov_nic_init/tasks/create_vfs.yml index b5c177bb..46bbbedc 100644 --- a/roles/sriov_nic_init/tasks/create_vfs.yml +++ b/roles/sriov_nic_init/tasks/create_vfs.yml @@ -33,11 +33,15 @@ block: # in case when SR-IOV VFs have been already configured we reset it first to avoid "device or resource busy" error - name: reset SR-IOV Virtual Functions - shell: echo 0 > /sys/class/net/{{ item.name }}/device/sriov_numvfs + ansible.builtin.shell: + cmd: echo 0 > /sys/class/net/{{ item.name }}/device/sriov_numvfs + changed_when: true when: existing_vfs.stdout|int != 0 and existing_vfs.stdout|int != item.sriov_numvfs - name: enable SR-IOV Virtual Functions - shell: echo {{ item.sriov_numvfs }} > /sys/class/net/{{ item.name }}/device/sriov_numvfs + ansible.builtin.shell: + cmd: echo {{ item.sriov_numvfs }} > /sys/class/net/{{ item.name }}/device/sriov_numvfs + changed_when: true when: existing_vfs.stdout|int != item.sriov_numvfs - name: force driver binding when VFs are created diff --git a/roles/sriov_shared_versions/defaults/main.yml b/roles/sriov_shared_versions/defaults/main.yml index c783f0ab..405e2426 100644 --- a/roles/sriov_shared_versions/defaults/main.yml +++ b/roles/sriov_shared_versions/defaults/main.yml @@ -15,9 +15,9 @@ ## --- sriov_net_dp_image: "ghcr.io/k8snetworkplumbingwg/sriov-network-device-plugin" -sriov_net_dp_tag: "v3.5.1" +sriov_net_dp_tag: "v3.6.2" sriov_cni_url: "https://github.com/k8snetworkplumbingwg/sriov-cni.git" sriov_cni_version: "v2.7.0" # Once new version is released, remove version WA from roles/sriov_cni_install/tasks/main.yml -# sriov_cni_version: "c1faa0805c92be6e5c629e9caf481f43cfee866c" +# sriov_cni_version: "75f2f5e1d06390d680e2f639213a58cf91915734" diff --git a/roles/tadk_install/defaults/main.yml b/roles/tadk_install/defaults/main.yml index b02a0373..af94ae66 100644 --- a/roles/tadk_install/defaults/main.yml +++ b/roles/tadk_install/defaults/main.yml @@ -21,7 +21,7 @@ image_name: tadk-waf tadk_version: "v23.03.0" container_port: 8005 -service_type: NodePort +service_type: ClusterIP deploy_name: tadk-intel diff --git a/roles/tadk_install/tasks/main.yml b/roles/tadk_install/tasks/main.yml index c9bd2082..eaf83f25 100644 --- a/roles/tadk_install/tasks/main.yml +++ b/roles/tadk_install/tasks/main.yml @@ -32,8 +32,8 @@ when: - inventory_hostname == groups['kube_control_plane'][0] -- name: populate template files with values - template: +- name: Populate template files with values + ansible.builtin.template: 
src: "{{ item }}.j2" dest: "{{ (dest_path, 'tadk', item) | path_join }}" force: yes @@ -44,11 +44,13 @@ when: - inventory_hostname == groups['kube_control_plane'][0] -- name: deploy the helm - ansible.builtin.command: >- - helm upgrade -i {{ deploy_name }} --create-namespace --namespace {{ tadk_namespace }} - -f {{ (dest_path, 'tadk', 'values.yaml') | path_join }} {{ (dest_path, 'tadk') | path_join }} - args: - chdir: "{{ (dest_path, 'tadk') | path_join }}" +- name: Deploy tadk helm chart + kubernetes.core.helm: + chart_ref: "{{ (dest_path, 'tadk') | path_join }}" + release_name: "{{ deploy_name }}" + release_namespace: "{{ tadk_namespace }}" + values_files: "{{ (dest_path, 'tadk', 'values.yaml') | path_join }}" + create_namespace: true + force: true when: - inventory_hostname == groups['kube_control_plane'][0] diff --git a/roles/telegraf_install/defaults/main.yml b/roles/telegraf_install/defaults/main.yml index 1cbd1db3..08da225b 100644 --- a/roles/telegraf_install/defaults/main.yml +++ b/roles/telegraf_install/defaults/main.yml @@ -50,6 +50,8 @@ telegraf_config_profiles: - *basic build_your_own: &build_your_own - *basic + base_video_analytics: + - *basic on_prem_vss: - *basic on_prem_sw_defined_factory: @@ -62,7 +64,7 @@ telegraf_chart_path: "{{ (project_root_dir, 'charts', 'telegraf') | path_join }} telegraf_root_path: "{{ (project_root_dir, 'telegraf') | path_join }}" telegraf_helm_values_file: "{{ telegraf_chart_path }}/values.yaml" -telegraf_scrap_interval: 30 +telegraf_scrape_interval: 30 telegraf_prometheus_metrics_endpoint_port: 9273 telegraf_image_name: "docker.io/intel/observability-telegraf" diff --git a/roles/telegraf_install/tasks/preflight.yml b/roles/telegraf_install/tasks/preflight.yml index 4db76792..3a6e8c2b 100644 --- a/roles/telegraf_install/tasks/preflight.yml +++ b/roles/telegraf_install/tasks/preflight.yml @@ -20,3 +20,10 @@ msg: - Deployment profile '{{ profile_name }}' has no telegraf configuration defined. - Please define telegraf configuration for the current profile in {{ role_name }} role defaults. + +- name: Check that Collectd is disabled + ansible.builtin.assert: + that: + - not collectd_enabled | default(false) + fail_msg: | + When Telegraf is enabled then Collectd must be disabled. diff --git a/roles/telegraf_install/templates/telegraf_plugins_conf.yml.j2 b/roles/telegraf_install/templates/telegraf_plugins_conf.yml.j2 index 5789c9be..e2bf42cc 100644 --- a/roles/telegraf_install/templates/telegraf_plugins_conf.yml.j2 +++ b/roles/telegraf_install/templates/telegraf_plugins_conf.yml.j2 @@ -1,6 +1,6 @@ agent: | [agent] - interval = "5s" + interval = "{{ telegraf_scrape_interval }}s" round_interval = true metric_batch_size = 10000 metric_buffer_limit = 100000 diff --git a/roles/telegraf_install/templates/values.yaml.j2 b/roles/telegraf_install/templates/values.yaml.j2 index c821aaeb..d9a4cf1e 100644 --- a/roles/telegraf_install/templates/values.yaml.j2 +++ b/roles/telegraf_install/templates/values.yaml.j2 @@ -2,8 +2,6 @@ # This is a YAML-formatted file. # Declare variables to be passed into your templates. 
-# general settings -scrapInterval: {{ telegraf_scrap_interval }} # port of the prometheus_output plugin to expose prometheusMetricsEndpointPort: {{ telegraf_prometheus_metrics_endpoint_port }} diff --git a/roles/userspace_cni_install/defaults/main.yml b/roles/userspace_cni_install/defaults/main.yml index 0276b8d1..befece9a 100644 --- a/roles/userspace_cni_install/defaults/main.yml +++ b/roles/userspace_cni_install/defaults/main.yml @@ -16,13 +16,22 @@ --- userspace_cni_git_url: "https://github.com/intel/userspace-cni-network-plugin.git" userspace_cni_version: "v1.3" - -vpp_version: "2302" +userspace_cni_path: "{{ (project_root_dir, 'userspace_cni') | path_join }}" ovs_dir: "{{ (project_root_dir, 'ovs') | path_join }}" ovs_repo: https://github.com/openvswitch/ovs.git +default_ovs_version: "v3.2.1" +ovs_version: "{{ userspace_cni.ovs_version | default(default_ovs_version) }}" dpdk_dir: "{{ (project_root_dir, 'dpdk-' + dpdk_version) | path_join }}" dpdk_build: '{{ dpdk_dir }}/x86_64-native-linuxapp-gcc' vpp_dir: "{{ (project_root_dir, 'vpp') | path_join }}" +vpp_version: "2310" + +buildtool: >- + {%- if container_runtime == 'docker' -%} + docker + {%- else -%} + podman + {%- endif -%} diff --git a/roles/userspace_cni_install/files/ovs-dpdk b/roles/userspace_cni_install/files/ovs-dpdk new file mode 100644 index 00000000..011647da --- /dev/null +++ b/roles/userspace_cni_install/files/ovs-dpdk @@ -0,0 +1,14 @@ +#!/bin/bash + +case $1 in + Restart|restart) + /usr/local/share/openvswitch/scripts/ovs-ctl --no-ovs-vswitchd restart + /usr/local/share/openvswitch/scripts/ovs-ctl --no-ovsdb-server --db-sock="/usr/local/var/run/openvswitch/db.sock" restart + ;; + Stop|stop) + /usr/local/share/openvswitch/scripts/ovs-ctl --no-ovsdb-server --db-sock="/usr/local/var/run/openvswitch/db.sock" stop + /usr/local/share/openvswitch/scripts/ovs-ctl --no-ovs-vswitchd stop + ;; + *) + echo "Incorrect parameter" +esac diff --git a/roles/userspace_cni_install/files/ovs-dpdk.service b/roles/userspace_cni_install/files/ovs-dpdk.service new file mode 100644 index 00000000..8e1fe286 --- /dev/null +++ b/roles/userspace_cni_install/files/ovs-dpdk.service @@ -0,0 +1,12 @@ +[Unit] +Description=RA OVS-DPDK Daemon Service +DefaultDependencies=no + +[Service] +Type=oneshot +RemainAfterExit=true +ExecStart=/opt/cek/ovs-dpdk restart +ExecStop=/opt/cek/ovs-dpdk stop + +[Install] +WantedBy=multi-user.target diff --git a/roles/userspace_cni_install/files/ovs-reboot.service b/roles/userspace_cni_install/files/ovs-reboot.service new file mode 100644 index 00000000..e9471429 --- /dev/null +++ b/roles/userspace_cni_install/files/ovs-reboot.service @@ -0,0 +1,12 @@ +[Unit] +Description=Kill the OVS process to unlock the reboot +Before=shutdown.target reboot.target halt.target +DefaultDependencies=no + +[Service] +Type=oneshot +RemainAfterExit=true +ExecStart=/opt/cek/ovs-dpdk stop + +[Install] +WantedBy=shutdown.target reboot.target halt.target \ No newline at end of file diff --git a/roles/userspace_cni_install/tasks/main.yml b/roles/userspace_cni_install/tasks/main.yml index 38cd6f5d..5bc6c61f 100644 --- a/roles/userspace_cni_install/tasks/main.yml +++ b/roles/userspace_cni_install/tasks/main.yml @@ -14,34 +14,21 @@ ## limitations under the License. 
## --- -- name: install dependencies - include_role: +- name: Install dependencies + ansible.builtin.include_role: name: install_dependencies -- name: determine whether VPP can be installed on the target - set_fact: - vpp_supported: false - when: ansible_os_family == 'RedHat' and ansible_distribution_version >= '8' - -- name: install OVS-DPDK - include_tasks: ovs_install.yml - when: ovs_dpdk_enabled | default(false) - -- name: install userspace cni if supported - include_tasks: userspace_cni_install.yml +- name: Install OVS-DPDK + ansible.builtin.include_tasks: ovs_install.yml when: - - userspace_cni_enabled - - ovs_dpdk_enabled or vpp_supported + - userspace_cni.vswitch | default(false) == 'ovs' -- name: warn if VPP is enabled but isn't supported - debug: - msg: VPP enabled in the config, but cannot be installed because the OS isn't supported. Continuing without VPP support. +- name: Install VPP + ansible.builtin.include_tasks: vpp_install.yml when: - - vpp_enabled | default(false) - - not vpp_supported + - userspace_cni.vswitch | default(false) == 'vpp' -- name: install VPP - include_tasks: vpp_install.yml +- name: Install Userspace CNI + ansible.builtin.include_tasks: userspace_cni_install_old.yml when: - - vpp_enabled | default(false) - - vpp_supported | default(false) + - userspace_cni_enabled | default(false) diff --git a/roles/userspace_cni_install/tasks/ovs_install.yml b/roles/userspace_cni_install/tasks/ovs_install.yml index a7cec781..d9cf7163 100644 --- a/roles/userspace_cni_install/tasks/ovs_install.yml +++ b/roles/userspace_cni_install/tasks/ovs_install.yml @@ -14,28 +14,24 @@ ## limitations under the License. ## --- -- name: assert that hugepages are enabled - assert: - that: hugepages_enabled | default(false) - fail_msg: "Hugepages are disabled. Please configure hugepages in the host vars or disable OVS-DPDK installation." 
- -- name: clone OVS git repository - git: +- name: Clone OVS git repository + ansible.builtin.git: repo: '{{ ovs_repo }}' dest: '{{ ovs_dir }}' - version: '{{ ovs_version | default("master") }}' + version: '{{ ovs_version }}' force: yes register: ovs_changed -- name: check whether bootstrap is required - stat: path={{ ovs_dir }}/configure +- name: Check whether bootstrap is required + ansible.builtin.stat: + path: "{{ ovs_dir }}/configure" register: ovs_config_status - name: bootstrap OVS -# noqa no-handler - more than one condition, can't be a handler command: ./boot.sh args: chdir: "{{ ovs_dir }}" + changed_when: true when: not ovs_config_status.stat.exists or (ovs_rebuild is defined) or ovs_changed.changed - name: Check if OVS Makefile exists @@ -50,11 +46,11 @@ - name: install dpdk-devel to prepare OVS-DPDK build in RHEL / Rocky >= 8.2 dnf: name: dpdk-devel - when: ansible_os_family == "RedHat" and ansible_distribution_version >= '8.2' + when: ansible_os_family == "RedHat" and ansible_distribution_version is version('8.2', '>=') - name: update the dynamic linker cache in RHEL / Rocky >= 8.2 command: "ldconfig" - when: ansible_os_family == "RedHat" and ansible_distribution_version >= '8.2' + when: ansible_os_family == "RedHat" and ansible_distribution_version is version('8.2', '>=') changed_when: true - name: prepare OVS-DPDK build in RHEL / Rocky >= 8.2 @@ -63,8 +59,9 @@ chdir: "{{ ovs_dir }}" environment: PKG_CONFIG_PATH: "/usr/local/lib64/pkgconfig:/usr/share/pkgconfig:/usr/lib64/pkgconfig" + changed_when: true when: - - ansible_os_family == "RedHat" and ansible_distribution_version >= '8.2' + - ansible_os_family == "RedHat" and ansible_distribution_version is version('8.2', '>=') - not ovs_makefile_status.stat.exists or (ovs_rebuild is defined) or ovs_changed.changed - name: prepare OVS-DPDK build @@ -74,6 +71,7 @@ chdir: "{{ ovs_dir }}" environment: PKG_CONFIG_PATH: "/usr/local/lib64/pkgconfig:/usr/share/pkgconfig:/usr/lib64/pkgconfig" + changed_when: true when: - ansible_distribution == 'Ubuntu' - not ovs_makefile_status.stat.exists or (ovs_rebuild is defined) or ovs_changed.changed @@ -120,6 +118,7 @@ - name: create database configuration command: '/usr/local/bin/ovsdb-tool create /usr/local/etc/openvswitch/conf.db /usr/local/share/openvswitch/vswitch.ovsschema' + changed_when: true when: not ovs_dbconfig_status.stat.exists - name: start OVS database server @@ -129,7 +128,7 @@ changed_when: true - name: set OVS dpdk-socket-mem - command: '/usr/local/bin/ovs-vsctl --no-wait set Open_vSwitch . other_config:dpdk-socket-mem="{{ ovs_dpdk_socket_mem }}"' + command: '/usr/local/bin/ovs-vsctl --no-wait set Open_vSwitch . other_config:dpdk-socket-mem="{{ userspace_cni.ovs_dpdk_socket_mem }}"' changed_when: true - name: set OVS dpdk-init @@ -141,7 +140,7 @@ changed_when: true - name: set OVS dpdk-lcore-mask - command: '/usr/local/bin/ovs-vsctl --no-wait set Open_vSwitch . other_config:dpdk-lcore-mask={{ ovs_dpdk_lcore_mask }}' + command: '/usr/local/bin/ovs-vsctl --no-wait set Open_vSwitch . 
other_config:dpdk-lcore-mask={{ userspace_cni.ovs_dpdk_lcore_mask }}' changed_when: true - name: WA for bug in DPDK initial device scan - block qat devices @@ -233,11 +232,48 @@ when: - qat_devices is defined and (qat_devices|length>0) -- name: start OVS vswitchd - command: /usr/local/share/openvswitch/scripts/ovs-ctl --no-ovsdb-server --db-sock="/usr/local/var/run/openvswitch/db.sock" restart - environment: - OVS_RUNDIR: /usr/local/var/run/openvswitch - changed_when: true +- name: copy ovs-dpdk script to /opt/cek/ + ansible.builtin.copy: + src: "ovs-dpdk" + dest: /opt/cek/ovs-dpdk + owner: root + group: root + mode: '0700' + become: yes + +- name: copy ovs-dpdk.service script to /lib/systemd/ folder + ansible.builtin.copy: + src: "ovs-dpdk.service" + dest: /lib/systemd/system/ovs-dpdk.service + owner: root + group: root + mode: '0644' + become: yes + +- name: copy ovs-reboot.service script to /lib/systemd/ folder + ansible.builtin.copy: + src: "ovs-reboot.service" + dest: /lib/systemd/system/ovs-reboot.service + owner: root + group: root + mode: '0644' + become: yes + +- name: ensure that ovs-dpdk service is enabled and restarted + ansible.builtin.systemd: + name: ovs-dpdk + state: started + enabled: yes + daemon_reload: yes + become: yes + +- name: ensure that ovs-reboot service is enabled and restarted + ansible.builtin.systemd: + name: ovs-reboot + state: stopped + enabled: yes + daemon_reload: yes + become: yes - name: create OVS bridge command: /usr/local/bin/ovs-vsctl --may-exist add-br br0 -- set bridge br0 datapath_type=netdev diff --git a/roles/userspace_cni_install/tasks/preflight.yml b/roles/userspace_cni_install/tasks/preflight.yml new file mode 100644 index 00000000..346687c3 --- /dev/null +++ b/roles/userspace_cni_install/tasks/preflight.yml @@ -0,0 +1,110 @@ +## +## Copyright (c) 2020-2023 Intel Corporation. +## +## Licensed under the Apache License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. +## You may obtain a copy of the License at +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +## See the License for the specific language governing permissions and +## limitations under the License. +## +- name: Check vswitch definition + ansible.builtin.assert: + that: + - userspace_cni.vswitch is defined + - userspace_cni.vswitch in ['ovs', 'vpp'] + fail_msg: + - "When Userspace CNI is enabled, userspace_cni.vswitch must be defined in host_vars for each node." + - "Supported vswitch values are: vpp, ovs" + - "Node {{ inventory_hostname }} has incorrect configuration: {{ userspace_cni.vswitch | default('undefined') }}." + +# VPP part +- name: Check VPP configuration + when: userspace_cni.vswitch | default(false) == 'vpp' + block: + # W/A Disabled until userspace CNI compilation is fixed + - name: check OS for VPP compilation + ansible.builtin.fail: + msg: "VPP is temporarily not supported." + + - name: VPP | Check example network attachment definitions + ansible.builtin.assert: + that: + - example_net_attach_defs['userspace_vpp'] | default(false) + fail_msg: "When VPP is enabled, example_net_attach_defs.userspace_vpp must be enabled as well." 
+ + - name: VPP | Check hugepages + ansible.builtin.assert: + that: + - hugepages_enabled | default(false) + - default_hugepage_size == "2M" + - number_of_hugepages_2M > 0 + fail_msg: + - When VPP is enabled, hugepages must be enabled and default_hugepage_size must be set to 2M according to host_vars example. + - Please correct the configuration + + - name: VPP | Check OS for VPP compilation + ansible.builtin.assert: + that: + - (ansible_distribution == 'Ubuntu' and ansible_distribution_version is version('22.04', '>=')) + fail_msg: "Unsupported configuration. VPP can only be enabled on Ubuntu >= 22.04" + + +# OVS DPDK part +- name: Check OVS configuration + when: userspace_cni.vswitch | default(false) == 'ovs' + block: + - name: OVS | Check DPDK enabled + ansible.builtin.assert: + that: + - install_dpdk | default(false) + - dpdk_version is defined + fail_msg: + - When OVS is enabled, DPDK must be enabled as well. + - Please set 'install_dpdk' to 'true' in host_vars and set the dpdk_version value. + + - name: OVS | Warn if default version selected + ansible.builtin.debug: + msg: "OVS version not defined in host vars! Defaulting to version {{ ovs_version }}." + when: userspace_cni.ovs_version is not defined + + # Refer https://docs.openvswitch.org/en/latest/faq/releases/ to get OVS DPDK compatibility + - name: OVS | Check DPDK compatibility + ansible.builtin.assert: + that: + ovs_version == 'v3.2.1' and dpdk_version == '22.11.1' + or ovs_version == 'v3.2.0' and dpdk_version == '22.11.1' + or ovs_version == 'v3.1.1' and dpdk_version == '22.11.1' + or ovs_version == 'v3.0.1' and dpdk_version == '21.11.2' + or (ovs_version >= 'v2.17.0' and ovs_version <= 'v3.0.3') and (dpdk_version >= '21.11' and dpdk_version <= '22.07') + or (ovs_version < 'v2.16.2' and ovs_version >= 'v2.16.0') and dpdk_version == '21.08' + or ovs_version == 'v2.15.0' and dpdk_version == '20.11' + or ovs_version == 'v2.14.2' and dpdk_version == '19.11.6' + or ovs_version == 'v2.14.1' and dpdk_version == '19.11.6' + or ovs_version == 'v2.14.0' and dpdk_version == '19.11.6' + or ovs_version == 'v2.13.3' and dpdk_version == '19.11.6' + or ovs_version == 'v2.13.2' and dpdk_version == '19.11.6' + or ovs_version == 'v2.13.1' and dpdk_version == '19.11.6' + or ovs_version == 'v2.13.0' and dpdk_version == '19.11.6' + fail_msg: "OVS {{ ovs_version }} does not build with DPDK version {{ dpdk_version }}. Please correct the host_vars configuration" + + - name: OVS | Check example network attachment definitions + ansible.builtin.assert: + that: + - example_net_attach_defs['userspace_ovs_dpdk'] | default(false) + fail_msg: "When OVS DPDK is enabled, example_net_attach_defs.userspace_ovs_dpdk must be enabled as well." + + - name: OVS | Check hugepages + ansible.builtin.assert: + that: + - hugepages_enabled | default(false) + - default_hugepage_size == "1G" + - number_of_hugepages_1G > 0 + fail_msg: + - When OVS is enabled, hugepages must be enabled and default_hugepage_size must be set to 1G according to host_vars example. + - Please correct the configuration diff --git a/roles/userspace_cni_install/tasks/userspace_cni_install.yml b/roles/userspace_cni_install/tasks/userspace_cni_install.yml index 05821d69..51787ec4 100644 --- a/roles/userspace_cni_install/tasks/userspace_cni_install.yml +++ b/roles/userspace_cni_install/tasks/userspace_cni_install.yml @@ -14,67 +14,21 @@ ## limitations under the License. 
## --- -- name: create /opt/cni/bin - file: - path: "/opt/cni/bin" - state: directory - recurse: yes - mode: 0755 - -- name: set path to the Userspace CNI plugin sources - set_fact: - userspace_cni_path: "{{ ansible_env.HOME }}/go/src/github.com/intel/userspace-cni-network-plugin" - -- name: clone Userspace CNI Plugin repository - git: +- name: Clone Userspace CNI Plugin repository + ansible.builtin.git: repo: "{{ userspace_cni_git_url }}" version: "{{ userspace_cni_version }}" dest: "{{ userspace_cni_path }}" force: true -- name: replace CentOS with Rocky in Makefile - replace: - path: "{{ userspace_cni_path }}/Makefile" - regexp: 'centos' - replace: 'rocky' - mode: 0600 - when: ansible_distribution == "Rocky" - - -# START OF BLOCK W/A to unblock userspace CNI -- name: patch Userspace CNI plugin to build only ovs-dpdk part - ansible.posix.patch: - src: "userspace_cni.patch" - dest: "{{ userspace_cni_path }}/userspace/userspace.go" - -- name: build Userspace CNI plugin without VPP support - shell: > - source /etc/profile.d/golang.sh && - export GO111MODULE=on && - go build -v - args: - chdir: "{{ userspace_cni_path }}/userspace" - executable: /bin/bash - creates: "{{ userspace_cni_path }}/userspace/userspace" - -# - name: build Userspace CNI plugin -# shell: > -# source /etc/profile.d/golang.sh && -# make clean && -# export GO111MODULE=on && -# make install-dep && -# make install && -# make -# args: -# chdir: "{{ userspace_cni_path }}" -# executable: /bin/bash -# creates: "{{ userspace_cni_path }}/userspace/userspace" - -# END OF BLOCK W/A to unblock userspace CNI - -- name: copy built Userspace CNI plugin binary to the CNI bin dir - copy: - remote_src: yes - src: "{{ userspace_cni_path }}/userspace/userspace" - dest: "/opt/cni/bin/userspace" - mode: 0755 +- name: Build, push and deploy Userspace CNI plugin + become: true + environment: + BUILDAH_FORMAT: 'docker' + community.general.make: + target: all + chdir: "{{ userspace_cni_path }}" + params: + IMAGE_BUILDER: "{{ buildtool }}" + IMAGE_REGISTRY: "{{ registry_local_address }}/" + IMAGE_VERSION: v1.4-prerelease # TODO use version reference when release is out diff --git a/roles/userspace_cni_install/tasks/userspace_cni_install_old.yml b/roles/userspace_cni_install/tasks/userspace_cni_install_old.yml new file mode 100644 index 00000000..0b6ae4f6 --- /dev/null +++ b/roles/userspace_cni_install/tasks/userspace_cni_install_old.yml @@ -0,0 +1,64 @@ +## +## Copyright (c) 2020-2023 Intel Corporation. +## +## Licensed under the Apache License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. +## You may obtain a copy of the License at +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +## See the License for the specific language governing permissions and +## limitations under the License. 
+## +- name: create /opt/cni/bin + ansible.builtin.file: + path: "/opt/cni/bin" + state: directory + recurse: yes + mode: 0755 + +- name: set path to the Userspace CNI plugin sources + ansible.builtin.set_fact: + userspace_cni_path: "{{ ansible_env.HOME }}/go/src/github.com/intel/userspace-cni-network-plugin" + +- name: clone Userspace CNI Plugin repository + ansible.builtin.git: + repo: "{{ userspace_cni_git_url }}" + version: "{{ userspace_cni_version }}" + dest: "{{ userspace_cni_path }}" + force: true + +- name: replace CentOS with Rocky in Makefile + ansible.builtin.replace: + path: "{{ userspace_cni_path }}/Makefile" + regexp: 'centos' + replace: 'rocky' + mode: 0600 + when: ansible_distribution == "Rocky" + +# START OF BLOCK W/A to unblock userspace CNI +- name: patch Userspace CNI plugin to build only ovs-dpdk part + ansible.posix.patch: + src: "userspace_cni.patch" + dest: "{{ userspace_cni_path }}/userspace/userspace.go" + +- name: build Userspace CNI plugin without VPP support + ansible.builtin.shell: > + source /etc/profile.d/golang.sh && + export GO111MODULE=on && + go build -v + args: + chdir: "{{ userspace_cni_path }}/userspace" + executable: /bin/bash + creates: "{{ userspace_cni_path }}/userspace/userspace" +# END OF BLOCK W/A to unblock userspace CNI + +- name: copy built Userspace CNI plugin binary to the CNI bin dir + ansible.builtin.copy: + remote_src: yes + src: "{{ userspace_cni_path }}/userspace/userspace" + dest: "/opt/cni/bin/userspace" + mode: 0755 diff --git a/roles/userspace_cni_install/vars/main.yml b/roles/userspace_cni_install/vars/main.yml index a2cad828..e0a58758 100644 --- a/roles/userspace_cni_install/vars/main.yml +++ b/roles/userspace_cni_install/vars/main.yml @@ -22,23 +22,17 @@ install_dependencies: - build-essential - autoconf - automake + - wget - libtool - shtool - - wget RedHat: - git - - gcc - make - autoconf - automake - - libtool - wget + - gcc + - libtool -ovs_dpdk_lcore_mask: 0x1 ovs_dpdk_pmd_mask: 0x2 -ovs_dpdk_socket_mem: "256,0" ovs_dpdk_extra: "--block=" - -# supported unless proven otherwise -vpp_supported: true -userspace_cni_supported: false diff --git a/roles/vm/auto_configure_nic_devices/tasks/main.yml b/roles/vm/auto_configure_nic_devices/tasks/main.yml new file mode 100644 index 00000000..a4e7bddc --- /dev/null +++ b/roles/vm/auto_configure_nic_devices/tasks/main.yml @@ -0,0 +1,164 @@ +## +## Copyright (c) 2020-2023 Intel Corporation. +## +## Licensed under the Apache License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. +## You may obtain a copy of the License at +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +## See the License for the specific language governing permissions and +## limitations under the License. 
+## +--- +- name: create new_vms list + ansible.builtin.set_fact: + new_vms: [] + +- name: get available NIC VF devices + ansible.builtin.shell: + cmd: "set -o pipefail && lspci -nnm | egrep -i '{{ nic_supported_vf_dev_ids | join('|') }}'" + executable: /bin/bash + register: nic_pci_vf_devices + changed_when: false + failed_when: nic_pci_vf_devices.rc not in [0, 1] + +- name: get NIC device bus IDs + ansible.builtin.set_fact: + nic_vf_devices: "{{ nic_pci_vf_devices.stdout_lines | map('split') | map('first') }}" + +- name: get configured worker and generic nodes + ansible.builtin.set_fact: + worker_nodes: "{{ vms | selectattr('type', 'in', ['work', 'vm']) }}" + +- name: get path for saved NIC PCI config + ansible.builtin.set_fact: + nic_pci_conf_path: "{{ (lookup('env', 'PWD'), '.nic-pci-' + inventory_hostname + '.yml') | path_join }}" + +- name: stat saved NIC PCI config + ansible.builtin.stat: + path: "{{ nic_pci_conf_path }}" + register: nic_pci_config_stat + delegate_to: localhost + +- name: read saved NIC PCI config + ansible.builtin.set_fact: + nic_vms_pci_config: "{{ lookup('file', nic_pci_conf_path) | from_yaml }}" + when: + - not vm_recreate_existing + - nic_pci_config_stat.stat.exists + +- name: get free NIC devices + ansible.builtin.set_fact: + free_nic_pci_devices: "{{ nic_vf_devices | difference(nic_vms_pci_config | map(attribute='nic_pci') | flatten) }}" + when: + - scale | d(false) + - nic_pci_config_stat.stat.exists + - not vm_recreate_existing + +- name: get new worker nodes if scaling + ansible.builtin.set_fact: + new_worker_nodes: "{{ worker_nodes | rejectattr('name', 'in', (nic_vms_pci_config | map(attribute='name'))) | list }}" + when: + - scale | d(false) + - nic_pci_config_stat.stat.exists + - not vm_recreate_existing + +- name: get new NIC VF devices count + ansible.builtin.set_fact: + new_nic_devices_sum: "{{ new_worker_nodes | map(attribute='nic_devices_count', default=0) | sum }}" + when: + - scale | d(false) + - nic_pci_config_stat.stat.exists + - not vm_recreate_existing + +- name: check if we have enough NIC VFs left available + ansible.builtin.assert: + that: (new_nic_devices_sum | int) <= (free_nic_pci_devices | length | int) + fail_msg: + "Not enough free NIC VFs left ({{ free_nic_pci_devices | length }}). New VMs configured to have {{ new_nic_devices_sum }} VFs." 
+ when: + - scale | d(false) + - nic_pci_config_stat.stat.exists + - not vm_recreate_existing + +- name: create new worker node pci config + ansible.builtin.set_fact: + nic_vms_pci_config: |- + [ + {% for node in worker_nodes %} + { + "name": "{{ node.name }}", + "nic_pci": {{ + (nic_vf_devices[ + (worker_nodes[0:loop.index0] | map(attribute='nic_devices_count', default=0) | sum): + (node.nic_devices_count | d(0)) + (worker_nodes[0:loop.index0] | map(attribute='nic_devices_count', default=0) | sum) + ]) + }}, + }, + {% endfor %} + ] + when: + - vm_recreate_existing or + not nic_pci_config_stat.stat.exists + +- name: create new worker node pci config if scaling + ansible.builtin.set_fact: + scaled_nic_vms_pci_config: |- + [ + {% for node in new_worker_nodes %} + { + "name": "{{ node.name }}", + "nic_pci": {{ + (free_nic_pci_devices[ + (new_worker_nodes[0:loop.index0] | map(attribute='nic_devices_count', default=0) | sum): + (node.nic_devices_count | d(0)) + (new_worker_nodes[0:loop.index0] | map(attribute='nic_devices_count', default=0) | sum) + ]) + }}, + }, + {% endfor %} + ] + when: + - scale | d(false) + - not vm_recreate_existing + +- name: combine existing nic pci config if scaling + ansible.builtin.set_fact: + nic_vms_pci_config: "{{ nic_vms_pci_config + scaled_nic_vms_pci_config }}" + when: + - scale | d(false) + - not vm_recreate_existing + +- name: save NIC PCI config + ansible.builtin.copy: + content: "{{ nic_vms_pci_config | to_nice_yaml }}" + dest: "{{ nic_pci_conf_path }}" + mode: '0644' + delegate_to: localhost + when: + - vm_recreate_existing or + not nic_pci_config_stat.stat.exists or + scale | d(false) + +- name: combine new_vms list with user defined vms + ansible.builtin.set_fact: + new_vms: "{{ new_vms | d([]) + [item|combine(_selection)] }}" + loop: "{{ vms }}" + vars: + _selection: "{{ nic_vms_pci_config | selectattr('name', '==', item.name) | combine }}" + +- name: combine pci devices for each VM + ansible.builtin.set_fact: + new_vms: |- + [ + {% for vm in new_vms %} + {{ vm | combine({'pci': (vm.pci | d([]) + (vm.nic_pci | d([])) )}) }}, + {% endfor %} + ] + +- name: replace original vms config + ansible.builtin.set_fact: + vms: "{{ new_vms }}" diff --git a/roles/vm/auto_configure_qat_devices/tasks/main.yml b/roles/vm/auto_configure_qat_devices/tasks/main.yml new file mode 100644 index 00000000..e7679eb3 --- /dev/null +++ b/roles/vm/auto_configure_qat_devices/tasks/main.yml @@ -0,0 +1,164 @@ +## +## Copyright (c) 2020-2023 Intel Corporation. +## +## Licensed under the Apache License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. +## You may obtain a copy of the License at +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +## See the License for the specific language governing permissions and +## limitations under the License. 
+## +--- +- name: create new_vms list + ansible.builtin.set_fact: + new_vms: [] + +- name: get available QAT VF devices + ansible.builtin.shell: + cmd: "set -o pipefail && lspci -nnm | egrep -i '{{ qat_supported_vf_dev_ids | join('|') }}'" + executable: /bin/bash + register: qat_pci_vf_devices + changed_when: false + failed_when: qat_pci_vf_devices.rc not in [0, 1] + +- name: get QAT device bus IDs + ansible.builtin.set_fact: + qat_vf_devices: "{{ qat_pci_vf_devices.stdout_lines | map('split') | map('first') }}" + +- name: get configured worker and generic nodes + ansible.builtin.set_fact: + worker_nodes: "{{ vms | selectattr('type', 'in', ['work', 'vm']) }}" + +- name: get path for saved QAT PCI config + ansible.builtin.set_fact: + qat_pci_conf_path: "{{ (lookup('env', 'PWD'), '.qat-pci-' + inventory_hostname + '.yml') | path_join }}" + +- name: stat saved QAT PCI config + ansible.builtin.stat: + path: "{{ qat_pci_conf_path }}" + register: qat_pci_config_stat + delegate_to: localhost + +- name: read saved QAT PCI config + ansible.builtin.set_fact: + qat_vms_pci_config: "{{ lookup('file', qat_pci_conf_path) | from_yaml }}" + when: + - not vm_recreate_existing + - qat_pci_config_stat.stat.exists + +- name: get free QAT devices + ansible.builtin.set_fact: + free_qat_pci_devices: "{{ qat_vf_devices | difference(qat_vms_pci_config | map(attribute='qat_pci') | flatten) }}" + when: + - scale | d(false) + - qat_pci_config_stat.stat.exists + - not vm_recreate_existing + +- name: get new worker nodes if scaling + ansible.builtin.set_fact: + new_worker_nodes: "{{ worker_nodes | rejectattr('name', 'in', (qat_vms_pci_config | map(attribute='name'))) | list }}" + when: + - scale | d(false) + - qat_pci_config_stat.stat.exists + - not vm_recreate_existing + +- name: get new QAT VF devices count + ansible.builtin.set_fact: + new_qat_devices_sum: "{{ new_worker_nodes | map(attribute='qat_devices_count', default=0) | sum }}" + when: + - scale | d(false) + - qat_pci_config_stat.stat.exists + - not vm_recreate_existing + +- name: check if we have enough QAT VFs left available + ansible.builtin.assert: + that: (new_qat_devices_sum | int) <= (free_qat_pci_devices | length | int) + fail_msg: + "Not enough free QAT VFs devices left ({{ free_qat_pci_devices | length }}). New VMs configured to have {{ new_qat_devices_sum }} VFs." 
+ when: + - scale | d(false) + - qat_pci_config_stat.stat.exists + - not vm_recreate_existing + +- name: create new worker node pci config + ansible.builtin.set_fact: + qat_vms_pci_config: |- + [ + {% for node in worker_nodes %} + { + "name": "{{ node.name }}", + "qat_pci": {{ + (qat_vf_devices[ + (worker_nodes[0:loop.index0] | map(attribute='qat_devices_count', default=0) | sum): + (node.qat_devices_count | d(0)) + (worker_nodes[0:loop.index0] | map(attribute='qat_devices_count', default=0) | sum) + ]) + }}, + }, + {% endfor %} + ] + when: + - vm_recreate_existing or + not qat_pci_config_stat.stat.exists + +- name: create new worker node pci config if scaling + ansible.builtin.set_fact: + scaled_qat_vms_pci_config: |- + [ + {% for node in new_worker_nodes %} + { + "name": "{{ node.name }}", + "qat_pci": {{ + free_qat_pci_devices[ + (new_worker_nodes[0:loop.index0] | map(attribute='qat_devices_count', default=0) | sum): + (node.qat_devices_count | d(0)) + (new_worker_nodes[0:loop.index0] | map(attribute='qat_devices_count', default=0) | sum) + ] + }}, + }, + {% endfor %} + ] + when: + - scale | d(false) + - not vm_recreate_existing + +- name: combine existing qat pci config if scaling + ansible.builtin.set_fact: + qat_vms_pci_config: "{{ qat_vms_pci_config + scaled_qat_vms_pci_config }}" + when: + - scale | d(false) + - not vm_recreate_existing + +- name: save QAT PCI config + ansible.builtin.copy: + content: "{{ qat_vms_pci_config | to_nice_yaml }}" + dest: "{{ qat_pci_conf_path }}" + mode: '0644' + delegate_to: localhost + when: + - vm_recreate_existing or + not qat_pci_config_stat.stat.exists or + scale | d(false) + +- name: combine new_vms list with user defined vms + ansible.builtin.set_fact: + new_vms: "{{ new_vms | d([]) + [item|combine(_selection)] }}" + loop: "{{ vms }}" + vars: + _selection: "{{ qat_vms_pci_config | selectattr('name', '==', item.name) | combine }}" + +- name: combine pci devices for each VM + ansible.builtin.set_fact: + new_vms: |- + [ + {% for vm in new_vms %} + {{ vm | combine({'pci': (vm.pci | d([]) + (vm.qat_pci | d([])) )}) }}, + {% endfor %} + ] + +- name: replace original vms config + ansible.builtin.set_fact: + vms: "{{ new_vms }}" diff --git a/roles/vm/compile_libvirt/defaults/main.yml b/roles/vm/compile_libvirt/defaults/main.yml index d2bdc652..24f59df2 100644 --- a/roles/vm/compile_libvirt/defaults/main.yml +++ b/roles/vm/compile_libvirt/defaults/main.yml @@ -17,4 +17,4 @@ libvirt_groups: - libvirt - libvirtd - libvirt-qemu -libvirt_tag: 9.3.0 +libvirt_tag: 9.9.0 diff --git a/roles/vm/compile_libvirt/files/x86_features.xml b/roles/vm/compile_libvirt/files/x86_features.xml deleted file mode 100644 index 7b57df18..00000000 --- a/roles/vm/compile_libvirt/files/x86_features.xml +++ /dev/null @@ -1,619 +0,0 @@ [619 deleted lines of x86 CPU feature definitions omitted]
diff --git a/roles/vm/compile_libvirt/tasks/compile_libvirt.yml b/roles/vm/compile_libvirt/tasks/compile_libvirt.yml index 841153b2..cc7f7932 100644 --- a/roles/vm/compile_libvirt/tasks/compile_libvirt.yml +++ b/roles/vm/compile_libvirt/tasks/compile_libvirt.yml @@ -29,7 +29,7 @@ include_role: name: install_dependencies -- name: Clone libvirt fork with sgx support +- name: Clone libvirt - with sgx support git: repo: 'https://github.com/libvirt/libvirt.git' dest: "{{ (project_root_dir, 'libvirt') | path_join }}" @@ -41,7 +41,7 @@ enabled: no state: stopped -- name: Adding libvrit group +- name: Adding libvirt group group: name: "{{ item }}" state: present @@ -72,12 +72,6 @@ dest: /etc/libvirt/qemu.conf mode: '0644' -- name: Copy x86_features.xml file - copy: - src: x86_features.xml - dest: /usr/share/libvirt/cpu_map/x86_features.xml - mode: '0644' - - name: Enabling libvirt systemd: name: libvirtd diff --git a/roles/vm/manage_vms/tasks/main.yml b/roles/vm/manage_vms/tasks/main.yml index b40dc842..6acc71cb 100644 --- a/roles/vm/manage_vms/tasks/main.yml +++ b/roles/vm/manage_vms/tasks/main.yml @@ -18,7 +18,7 @@ include_role: name: bootstrap/allocate_cpus # noqa role-name[path] - role in bootstrap when: - - isolcpus_cpus_total is not defined and not isolcpus_cpus_total + - (isolcpus_cpus_total is not defined) or (not isolcpus_cpus_total) - name: Start VMs include_tasks: start_vm.yml diff --git a/roles/vm/manage_vms/tasks/optimize.yml b/roles/vm/manage_vms/tasks/optimize.yml index ecd81153..438f6470 100644 --- a/roles/vm/manage_vms/tasks/optimize.yml +++ b/roles/vm/manage_vms/tasks/optimize.yml @@ -21,4 +21,5 @@ numa: "{{ vm.numa }}" alloc_all: "{{ vm.alloc_all }}" pinning: true + host_name: "{{ hostvars[inventory_hostname]['ansible_hostname'] }}" changed_when: true diff --git a/roles/vm/manage_vms/tasks/start_vm.yml b/roles/vm/manage_vms/tasks/start_vm.yml index 5e989fac..467e1bdb 100644 --- a/roles/vm/manage_vms/tasks/start_vm.yml +++ b/roles/vm/manage_vms/tasks/start_vm.yml @@ -107,7 +107,7 @@ - name: copy OVMF.fd to ovmf folder ansible.builtin.copy: - src: /usr/share/tdvf/OVMF.fd + src: /usr/share/qemu/OVMF.fd dest: /usr/share/ovmf/OVMF.fd remote_src: yes mode: '0644' diff --git a/roles/vm/prepare_bm_host_config_vxlan/tasks/main.yml b/roles/vm/prepare_bm_host_config_vxlan/tasks/main.yml index ff640230..ad94f7a2 100644 --- a/roles/vm/prepare_bm_host_config_vxlan/tasks/main.yml +++ b/roles/vm/prepare_bm_host_config_vxlan/tasks/main.yml @@ -109,3 +109,10 @@ ansible_user: "{{ login_user }}" inventory_dir: '{{ inventory_dir }}' groups: all + +# add these BM hosts in a mixed cluster to a group +- name: Add hosts to inventory - bm_host + ansible.builtin.add_host: + hostname: "{{ inventory_hostname }}" + groups: "bm_host" + inventory_dir: '{{ inventory_dir }}' diff --git a/roles/vm/prepare_cek/tasks/main.yml b/roles/vm/prepare_cek/tasks/main.yml index 5e86bad4..9f2f0077 100644 --- a/roles/vm/prepare_cek/tasks/main.yml +++ b/roles/vm/prepare_cek/tasks/main.yml @@ -31,7 +31,7 @@ - name: Store primary IPs of running VMs set_fact: vm_ips: "{{ 
vm_ips | default({}) | combine( {item.item.name: item.stdout.splitlines() | first} ) }}" - when: item.changed and item.item.name is defined # noqa 503 # no-handler + when: item.changed and item.item.name is defined # noqa no-handler loop: "{{ vm_out.results }}" tags: - intel-platform-qat-setup diff --git a/roles/vm/prepare_cek_vxlan/tasks/main.yml b/roles/vm/prepare_cek_vxlan/tasks/main.yml index 4ded3ef5..3bd705cf 100644 --- a/roles/vm/prepare_cek_vxlan/tasks/main.yml +++ b/roles/vm/prepare_cek_vxlan/tasks/main.yml @@ -54,7 +54,7 @@ - name: Store VXLAN MACs of running VMs set_fact: vm_vxlan_macs: "{{ vm_vxlan_macs | default({}) | combine( {item.item.name: item.stdout} ) }}" - when: item.changed and item.item.name is defined # noqa 503 # no-handler + when: item.changed and item.item.name is defined # noqa no-handler loop: "{{ vm_vxlan_mac_out.results }}" - name: Print vm_vxlan_macs @@ -78,7 +78,7 @@ - name: Store VXLAN IPs of running VMs set_fact: vm_vxlan_ips: "{{ vm_vxlan_ips | default({}) | combine( {item.item.name: item.stdout} ) }}" - when: item.changed and item.item.name is defined # noqa 503 # no-handler + when: item.changed and item.item.name is defined # noqa no-handler loop: "{{ vm_vxlan_ip_out.results }}" - name: Print vm_vxlan_ips diff --git a/roles/wait_for_kubernetes_ready/tasks/main.yml b/roles/wait_for_kubernetes_ready/tasks/main.yml index 7c7babb3..839fa221 100644 --- a/roles/wait_for_kubernetes_ready/tasks/main.yml +++ b/roles/wait_for_kubernetes_ready/tasks/main.yml @@ -28,7 +28,7 @@ register: kube_api until: kube_api.status == 200 retries: 10 - delay: 2 + delay: 5 when: not on_cloud | default(false) - name: show all nodes on kubernetes cluster diff --git a/validation/sylva-validation/cnf-validation/README.md b/validation/sylva-validation/cnf-validation/README.md index 80cb4adc..ebfec15e 100644 --- a/validation/sylva-validation/cnf-validation/README.md +++ b/validation/sylva-validation/cnf-validation/README.md @@ -11,6 +11,20 @@ Kubernetes cluster with properly configured SR-IOV Device Plugin, configMap and ### For Docker-based version Docker accessible for user. +Since the RA 24.01 release, the default container management tool is Podman, following the change of the default container runtime to containerd. Users need to install Docker based on the BMRA access profile for stack validation. Reference steps to install and configure Docker are below. + +Docker installation: +``` +curl -fsSL https://get.docker.com | bash -s docker +``` +Configure the user group and restart the service: +``` +groupadd docker +systemctl daemon-reload +systemctl start docker +systemctl enable docker +``` +Configure a proxy for Docker if needed. ### For Linux-based version diff --git a/validation/verification-manual/ingress-nginx/README.md b/validation/verification-manual/ingress-nginx/README.md new file mode 100644 index 00000000..a5894072 --- /dev/null +++ b/validation/verification-manual/ingress-nginx/README.md @@ -0,0 +1,100 @@ +# Deployment of Ingress-nginx as Cluster Ingress Controller + +This section describes how to leverage Ingress resources in the cluster. + +## Ingress Controller description + +An Ingress Controller is an application within a Kubernetes cluster that enables Ingress resources to function. It is not automatically deployed with a Kubernetes cluster, and each variant can vary in implementation based on intended use, such as load balancing algorithms for Ingress resources. + +## Ingress resource description + +Ingress refers to an Ingress Resource, a Kubernetes API object which allows access to Services within a cluster. 
They are managed by an Ingress Controller. + +Ingress resources enable the following functionality: + +- Load balancing, extended through the use of Services +- Content-based routing, using hosts and paths +- TLS/SSL termination, based on hostnames + +For additional information, please read the official Kubernetes Ingress Documentation. + +## Ingress-Nginx as controller in RA deployment (Not applicable to CloudRA) + +The deployment of ingress-nginx on a reference architecture cluster has the following specifics: + +- There is no support for the LoadBalancer type of Services +- The external connection point for Ingress resources is provided by NodePorts 30123 & 30124, reachable via localhost on each node in the cluster + +## Validation of Ingress resource + +Sample resource files can be found at [sample_deployment.yaml](sample_deployment.yaml) and [sample_ingress.yaml](sample_ingress.yaml). + +1. Deploy a sample workload with an ingress resource: + + ```bash + $ kubectl apply -f sample_deployment.yaml + deployment.apps/http-svc created + service/http-svc created + + $ kubectl apply -f sample_ingress.yaml + ingress.networking.k8s.io/example-ingress created + ``` + +2. Check the status of the deployment, service, and ingress resources: + + ```bash + $ kubectl get pods + NAME READY STATUS RESTARTS AGE + http-svc-68dd884bb-wds72 1/1 Running 0 20s + + $ kubectl get svc http-svc + NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE + http-svc ClusterIP 10.233.6.129 80/TCP 27m + + $ kubectl get ingress + NAME CLASS HOSTS ADDRESS PORTS AGE + example-ingress nginx * 80 25s + ``` + +3. Reach the newly exposed ingress port from a node: + + ```bash + $ curl http://127.0.0.1:30123/echo-server/ + + + Hostname: http-svc-68dd884bb-wds72 + + Pod Information: + node name: am09-05-cyp + pod name: http-svc-68dd884bb-wds72 + pod namespace: default + pod IP: 10.244.39.42 + + Server values: + server_version=nginx: 1.14.2 - lua: 10015 + + Request Information: + client_address=10.244.39.50 + method=GET + real path=/ + query= + request_version=1.1 + request_scheme=http + request_uri=http://127.0.0.1:8080/ + + Request Headers: + accept=*/* + host=127.0.0.1:30123 + user-agent=curl/7.81.0 + x-forwarded-for=10.166.31.87 + x-forwarded-host=127.0.0.1:30123 + x-forwarded-port=80 + x-forwarded-proto=http + x-forwarded-scheme=http + x-real-ip=10.166.31.87 + x-request-id=32daf99bb22f84ae5da6c7d4d233d31c + x-scheme=http + + Request Body: + -no body in request- + ``` diff --git a/validation/verification-manual/ingress-nginx/sample_deployment.yaml b/validation/verification-manual/ingress-nginx/sample_deployment.yaml new file mode 100644 index 00000000..43eaeaaf --- /dev/null +++ b/validation/verification-manual/ingress-nginx/sample_deployment.yaml @@ -0,0 +1,51 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: http-svc +spec: + replicas: 1 + selector: + matchLabels: + app: http-svc + template: + metadata: + labels: + app: http-svc + spec: + containers: + - name: http-svc + image: registry.k8s.io/e2e-test-images/echoserver:2.3 + ports: + - containerPort: 8080 + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP +--- +apiVersion: v1 +kind: Service +metadata: + name: http-svc + labels: + app: http-svc +spec: + ports: + - port: 80 + targetPort: 8080 + protocol: TCP + name: http + selector: + app: http-svc diff --git 
a/validation/verification-manual/ingress-nginx/sample_ingress.yaml b/validation/verification-manual/ingress-nginx/sample_ingress.yaml new file mode 100644 index 00000000..2b461e16 --- /dev/null +++ b/validation/verification-manual/ingress-nginx/sample_ingress.yaml @@ -0,0 +1,20 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: example-ingress + labels: + app: http-svc + annotations: + nginx.ingress.kubernetes.io/rewrite-target: / +spec: + ingressClassName: nginx + rules: + - http: + paths: + - path: /echo-server + pathType: Prefix + backend: + service: + name: http-svc + port: + number: 80 diff --git a/validation/verification-manual/intel_media_transport_lib/README.md b/validation/verification-manual/intel_media_transport_lib/README.md index 4d80e908..dafa4239 100644 --- a/validation/verification-manual/intel_media_transport_lib/README.md +++ b/validation/verification-manual/intel_media_transport_lib/README.md @@ -8,48 +8,102 @@ In case of any error, please check provided FAQ at: +- KubeVirt Architecture - +- KubeVirt Configuration - + +## Test instructions + +Test resource yaml can be found at [vm.yaml](vm.yaml). + +1. Apply resource vm.yaml to get Virtual machine created: + + ```bash + $ kubectl create namespace vm-test + namespace/vm-test created + + $ kubectl apply -n vm-test -f vm.yaml + virtualmachine.kubevirt.io/testvm created + ``` + +2. Check VM has been created: + + ```bash + $ kubectl get vms -n vm-test + NAME AGE STATUS READY + testvm 21s Stopped False + ``` + +3. Start created VM + + ```bash + $ virtctl start testvm -n vm-test + VM testvm was scheduled to start + ``` + +4. Check status of VM instance to be `running`: + + ```bash + $ kubectl get vmis -n vm-test + NAME AGE PHASE IP NODENAME READY + testvm 20s Running 10.244.88.198 ad07-07-cyp True + ``` + +5. Access VM's console: + + ```bash + $ virtctl console -n vm-test testvm + Successfully connected to testvm console. The escape sequence is ^] + + ``` + __Note:__ Hit enter to get login page and login with the displayed username and password. You can disconnect from the virtual machine console by typing: `ctrl+]`. + +6. 
+
+6. Stop and delete the VM:
+
+   ```bash
+   $ virtctl stop testvm -n vm-test
+   VM testvm was scheduled to stop
+
+   $ kubectl delete vm testvm -n vm-test
+   virtualmachine.kubevirt.io "testvm" deleted
+
+   $ kubectl delete namespace vm-test
+   namespace "vm-test" deleted
+   ```
diff --git a/validation/verification-manual/kubevirt/vm.yaml b/validation/verification-manual/kubevirt/vm.yaml
new file mode 100644
index 00000000..66c23f2c
--- /dev/null
+++ b/validation/verification-manual/kubevirt/vm.yaml
@@ -0,0 +1,37 @@
+apiVersion: kubevirt.io/v1
+kind: VirtualMachine
+metadata:
+  name: testvm
+spec:
+  running: false
+  template:
+    metadata:
+      labels:
+        kubevirt.io/size: small
+        kubevirt.io/domain: testvm
+    spec:
+      domain:
+        devices:
+          disks:
+          - name: containerdisk
+            disk:
+              bus: virtio
+          - name: cloudinitdisk
+            disk:
+              bus: virtio
+          interfaces:
+          - name: default
+            masquerade: {}
+        resources:
+          requests:
+            memory: 64M
+      networks:
+      - name: default
+        pod: {}
+      volumes:
+      - name: containerdisk
+        containerDisk:
+          image: quay.io/kubevirt/cirros-container-disk-demo
+      - name: cloudinitdisk
+        cloudInitNoCloud:
+          userDataBase64: SGkuXG4=
diff --git a/validation/verification-manual/networking_features/userspace_cni/README.md b/validation/verification-manual/networking_features/userspace_cni/README.md
index a9668567..f5e6ffc7 100644
--- a/validation/verification-manual/networking_features/userspace_cni/README.md
+++ b/validation/verification-manual/networking_features/userspace_cni/README.md
@@ -1,59 +1,50 @@
# Userspace CNI Plugin
-This example shows how to use the Userspace CNI Plugin with DPDK enchanced Open vSwitch (OVS-DPDK) to attach vhostuser interfaces to pods in Kubernetes.
+
+This example shows how to use the Userspace CNI Plugin with DPDK-enhanced Open vSwitch (OVS-DPDK) or the VPP vSwitch to attach vhostuser interfaces to pods in Kubernetes.

## Check Network Configuration
+
When Userspace CNI is enabled, an example network attachment definition is created by default. Start by checking that this definition is available:
-```
+
+```bash
# kubectl get net-attach-def
NAME            AGE
userspace-ovs   14h
```

## Deploy Workload
-With the network attachment definition available, a workload can be deployed that requests an interface through the Userspace CNI Plugin. The provided pod manifest [pod-userspace.yml](pod-userspace.yml) can be used. The content of the file is:
-```
----
-apiVersion: v1
-kind: Pod
-metadata:
-  name: pod-userspace-1
-  annotations:
-    k8s.v1.cni.cncf.io/networks: userspace-ovs
-spec:
-  containers:
-  - name: pod-userspace-1
-    image: ubuntu:focal
-    command: [ "/bin/bash", "-c" ]
-    args: [ "sleep inf" ]
-    volumeMounts:
-    - mountPath: /vhu/
-      name: shared-dir
-  volumes:
-  - name: shared-dir
-    hostPath:
-      path: /var/lib/cni/vhostuser/
-```
+
+With the network attachment definition available, a workload can be deployed that requests an interface through the Userspace CNI Plugin. The provided pod manifest [pod-userspace-ovs.yaml](pod-userspace-ovs.yaml) can be used. In addition to requesting the network interface through `k8s.v1.cni.cncf.io/networks: userspace-ovs`, a volume mount is added for the vhostuser socket created by the Userspace CNI.
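+
+To see exactly what the `userspace-ovs` annotation refers to, you can dump the network attachment definition first (an optional check, not part of the original manual):
+
+```bash
+kubectl get net-attach-def userspace-ovs -o yaml
+```
+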
Deploy the pod:
-```
+
+```bash
# kubectl apply -f pod-userspace-ovs.yaml
```

## Verify Network
+
Start by verifying that a vhostuser socket has been added to the pod:
-```
-# kubectl exec pod-userspace-1 -- ls /vhu/
+
+```bash
+kubectl exec pod-userspace-ovs -- ls /vhu/
+
5dee26822a53-net1
```
+
If there are multiple worker nodes in the cluster, check which one the pod has been deployed on:
-```
-# kubectl describe pod pod-userspace-1 | grep Node:
+
+```bash
+kubectl describe pod pod-userspace-ovs | grep Node:
+
Node: node1/
```
Connect to that node using the IP found above, and verify that the vhostuser socket and interface have been added to OVS-DPDK:
-```
-# ovs-vsctl show
+
+```bash
+ovs-vsctl show
+
6836950b-fe14-42f7-823b-06ae680b88f4
    Bridge br0
        datapath_type: netdev
@@ -65,4 +56,5 @@ Connect to that node using the IP found above, and verify that the vhostuser soc
            type: dpdkvhostuser
    ovs_version: "2.17.2"
```
-At this point, the vhostuser socket is ready to use in the pod. The steps for using VPP as the vSwitch are similar, but instead of the Userspace CNI resource name userspace-ovs, use userspace-vpp.
+
+At this point, the vhostuser socket is ready to use in the pod. The steps for using VPP as the vSwitch are similar, but instead of the Userspace CNI resource name userspace-ovs, use userspace-vpp. The provided pod manifest [pod-userspace-vpp.yaml](pod-userspace-vpp.yaml) can be used.
diff --git a/validation/verification-manual/networking_features/userspace_cni/pod-userspace.yml b/validation/verification-manual/networking_features/userspace_cni/pod-userspace-ovs.yaml
similarity index 87%
rename from validation/verification-manual/networking_features/userspace_cni/pod-userspace.yml
rename to validation/verification-manual/networking_features/userspace_cni/pod-userspace-ovs.yaml
index 3d421aee..b73c6111 100644
--- a/validation/verification-manual/networking_features/userspace_cni/pod-userspace.yml
+++ b/validation/verification-manual/networking_features/userspace_cni/pod-userspace-ovs.yaml
@@ -2,12 +2,12 @@
apiVersion: v1
kind: Pod
metadata:
-  name: pod-userspace-1
+  name: pod-userspace-ovs
  annotations:
    k8s.v1.cni.cncf.io/networks: userspace-ovs
spec:
  containers:
-  - name: pod-userspace-1
+  - name: pod-userspace
    image: ubuntu:focal
    command: [ "/bin/bash", "-c" ]
    args: [ "sleep inf" ]
diff --git a/validation/verification-manual/networking_features/userspace_cni/pod-userspace-vpp.yaml b/validation/verification-manual/networking_features/userspace_cni/pod-userspace-vpp.yaml
new file mode 100644
index 00000000..317a608b
--- /dev/null
+++ b/validation/verification-manual/networking_features/userspace_cni/pod-userspace-vpp.yaml
@@ -0,0 +1,13 @@
+---
+apiVersion: v1
+kind: Pod
+metadata:
+  name: pod-userspace-vpp
+  annotations:
+    k8s.v1.cni.cncf.io/networks: userspace-vpp
+spec:
+  containers:
+  - name: pod-userspace
+    image: ubuntu:focal
+    command: [ "/bin/bash", "-c" ]
+    args: [ "sleep inf" ]
diff --git a/validation/verification-manual/power_manager/README.md b/validation/verification-manual/power_manager/README.md
index ac576e13..b2999443 100644
--- a/validation/verification-manual/power_manager/README.md
+++ b/validation/verification-manual/power_manager/README.md
@@ -104,6 +104,7 @@ To check Uncore Frequency, you can use following path:
```
You can then check files 'max_freq_khz' or 'min_freq_khz' which should store your desired Uncore Frequency values.
**Note:** Valid min and max values are determined by hardware. Die config will precede Package config, which will precede system-wide config.
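+
+For example, on a platform exposing the `intel_uncore_frequency` sysfs interface (the package/die directory name below is illustrative and varies per system):
+
+```
+# cat /sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/min_freq_khz
+# cat /sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/max_freq_khz
+```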
+**Note:** To prevent the min and max frequency values from being the same, set "System BIOS > System Profile Settings > Uncore Frequency" from "Maximum" to "Dynamic" in the machine's BIOS.

To set up C-States, you can choose from three different options:
```
@@ -138,7 +139,57 @@ To check your desired scaling driver, you can do following:
```
# cat /sys/devices/system/cpu/cpuX/cpufreq/scaling_driver
```
-And to check you desired scaling governor:
+**Note:** The available scaling drivers are: intel_pstate, intel_cpufreq
+
+To check your desired scaling governor:
```
# cat /sys/devices/system/cpu/cpuX/cpufreq/scaling_governor
```
+You can also check all available scaling governors:
+```
+# cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_governors
+```
+Time of Day can be used to schedule a change of the powerProfile or C-States for the shared pool, or of the powerProfile for selected pods. Schedule example:
+```
+schedule:
+  - time: "14:24"
+    # powerProfile sets the profile for the shared pool
+    powerProfile: balance-performance
+
+    # this transitions exclusive pods matching a given label from one profile to another
+    # please ensure that only pods to be used by power manager have this label
+    pods:
+      - labels:
+          matchLabels:
+            power: "true"
+        target: balance-performance
+      - labels:
+          matchLabels:
+            special: "false"
+        target: balance-performance
+
+    # the cState field takes a C-State spec
+    cState:
+      sharedPoolCStates:
+        C1: false
+        C6: true
+
+  - time: "14:26"
+    powerProfile: performance
+    cState:
+      sharedPoolCStates:
+        C1: true
+        C6: false
+
+  - time: "14:28"
+    powerProfile: balance-power
+    pods:
+      - labels:
+          matchLabels:
+            power: "true"
+        target: balance-power
+      - labels:
+          matchLabels:
+            special: "false"
+        target: performance
+```
diff --git a/vars_plugins/git_revision.py b/vars_plugins/git_revision.py
new file mode 100644
index 00000000..fd62e0f2
--- /dev/null
+++ b/vars_plugins/git_revision.py
@@ -0,0 +1,21 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+from ansible.plugins.vars import BaseVarsPlugin
+from git import Repo
+
+class VarsModule(BaseVarsPlugin):
+    def get_vars(self, loader, path, entities):
+        try:
+            repository = Repo(path, search_parent_directories=True)
+        except Exception:
+            # RA code directory is not versioned
+            return dict(
+                ra_is_git=False
+            )
+
+        return dict(
+            ra_git_commit=repository.head.commit,  # works on a detached HEAD too
+            ra_git_is_dirty=repository.is_dirty(untracked_files=True),
+            ra_is_git=True
+        )
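To sanity-check the GitPython calls this plugin relies on, you can run them standalone from anywhere inside the repository (an illustrative snippet, assuming GitPython is installed; not part of the change itself):

```bash
# Runs the same GitPython calls as the vars plugin and prints
# the current commit SHA and whether the working tree is dirty.
python3 -c "
from git import Repo
repo = Repo('.', search_parent_directories=True)
print(repo.head.commit, repo.is_dirty(untracked_files=True))
"
```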