From 071426c223fe1e8afedf8f67736a2cb08d765573 Mon Sep 17 00:00:00 2001
From: James Parker
Date: Fri, 14 Jan 2022 12:59:36 -0500
Subject: [PATCH] Test resize with mem_page_size in flavor

This change addresses issue [1]. It adds three new test cases:

* test_hugepage_resize_large_to_small
* test_hugepage_resize_size_to_small
* test_hugepage_resize_size_to_size

All three tests follow the same basic procedure: spawn a guest with a
flavor that sets hw:mem_page_size, resize the guest to a flavor with a
different hw:mem_page_size value, and then resize the guest back to the
original flavor. Throughout the tests, XML checks are conducted to
ensure the page size is accurate for the current flavor (a standalone
sketch of this check is included after the test_cpu_pinning.py diff
below).

Instead of trying to dynamically determine the hugepage sizes
configured on the computes, a new config parameter was added to define
what hugepage sizes are available on the host. To avoid dynamically
calculating guest RAM sizes based on available hugepages, a guest RAM
size parameter was also added so users may define the size to use when
spawning guests.

We also need a new job that has multiple hugepage sizes configured. We
cannot use our existing whitebox-devstack-multinode job because that
job runs tests that dynamically turn on file-backed memory, which is
incompatible with hugepages. This commit adds tasks to the job setup
that configure hugepages on the compute hosts.

In our devstack plugin.sh, we set track_instance_changes to True
(devstack defaults it to False) to make sure the scheduler has the
latest information about available huge pages, and to avoid a race
wherein instances failed to schedule because our lone 1G page still
appeared to be in use by an instance that had actually been fully
deleted.

[1] https://bugs.launchpad.net/nova/+bug/1831269

Change-Id: I5282df3b20c24a909f3b7bb97214206bc07e5b91
---
 .zuul.yaml                                  |  32 ++-
 devstack/plugin.sh                          |   3 +
 devstack/settings                           |   2 +
 playbooks/whitebox/pre.yaml                 |  43 +++-
 whitebox_tempest_plugin/api/compute/base.py |   8 +
 .../api/compute/test_cpu_pinning.py         |   8 -
 .../api/compute/test_hugepages.py           | 196 ++++++++++++++++++
 whitebox_tempest_plugin/config.py           |  13 +-
 8 files changed, 289 insertions(+), 16 deletions(-)
 create mode 100644 whitebox_tempest_plugin/api/compute/test_hugepages.py

diff --git a/.zuul.yaml b/.zuul.yaml
index 39ba9813..9e0c55ff 100644
--- a/.zuul.yaml
+++ b/.zuul.yaml
@@ -3,7 +3,11 @@
     nodes:
       - name: controller
         label: nested-virt-ubuntu-jammy
-      - name: compute
+      # NOTE(artom) We can't name the node 'compute' because that seems to
+      # take precedence over the 'compute' group in playbooks, so things we
+      # want to run on all hosts in the 'compute' group would only run on the
+      # subnode.
+      - name: compute-host
         label: nested-virt-ubuntu-jammy
     groups:
       # Node where tests are executed and test results collected
@@ -14,11 +18,11 @@
       - name: compute
         nodes:
           - controller
-          - compute
+          - compute-host
       # Nodes that are not the controller
       - name: subnode
         nodes:
-          - compute
+          - compute-host
       # Switch node for multinode networking setup
       - name: switch
         nodes:
@@ -26,7 +30,7 @@
       # Peer nodes for multinode networking setup
       - name: peers
         nodes:
-          - compute
+          - compute-host
 
 - job:
     name: whitebox-devstack-multinode
@@ -49,6 +53,7 @@
       # open source implementation of UEFI for VMs via the OVMF package. In
       # addition to test vTPM hosts need swtpm as well
       extra_packages: ovmf,swtpm-tools
+      tempest_exclude_regex: ^whitebox_tempest_plugin\.api\.compute\.test_hugepages
       devstack_localrc:
         MAX_COMPUTE_NODES: 2
         NOVA_SERVICE_REPORT_INTERVAL: 10
@@ -86,7 +91,6 @@
         swtpm_group: swtpm
     group-vars:
       subnode:
-        num_hugepages: 2048
         devstack_localrc:
           LIBVIRT_TYPE: kvm
           NOVA_SERVICE_REPORT_INTERVAL: 10
@@ -109,11 +113,24 @@
             swtpm_user: swtpm
             swtpm_group: swtpm
       tempest:
-        num_hugepages: 512
         devstack_plugins:
           barbican: https://opendev.org/openstack/barbican.git
           whitebox-tempest-plugin: https://opendev.org/openstack/whitebox-tempest-plugin.git
 
+- job:
+    name: whitebox-devstack-multinode-hugepages
+    parent: whitebox-devstack-multinode
+    description: |
+      Runs the hugepages tests on a deployment that has set up hugepages on the hosts.
+    vars:
+      tempest_test_regex: ^whitebox_tempest_plugin\.api\.compute\.test_hugepages
+      # NOTE(artom) The parent job's exclude regex excludes the hugepages
+      # tests, so we need to overwrite it here with a regex that matches
+      # *nothing*.
+      tempest_exclude_regex: $^
+      num_2M_pages: 512
+      num_1G_pages: 1
+
 - job:
     name: whitebox-devstack-ceph-multinode
     parent: devstack-plugin-ceph-multinode-tempest-py3
@@ -174,3 +191,6 @@
       - whitebox-devstack-multinode
       - whitebox-devstack-ceph-multinode
       - openstack-tox-pep8
+    experimental:
+      jobs:
+        - whitebox-devstack-multinode-hugepages
diff --git a/devstack/plugin.sh b/devstack/plugin.sh
index 44a9e063..c8940541 100644
--- a/devstack/plugin.sh
+++ b/devstack/plugin.sh
@@ -19,6 +19,7 @@
 function configure {
     iniset $TEMPEST_CONFIG whitebox default_video_model $WHITEBOX_DEFAULT_VIDEO_MODEL
     iniset $TEMPEST_CONFIG whitebox max_disk_devices_to_attach $WHITEBOX_MAX_DISK_DEVICES_TO_ATTACH
     iniset $TEMPEST_CONFIG whitebox nodes_yaml $WHITEBOX_NODES_YAML
+    iniset $TEMPEST_CONFIG whitebox hugepage_guest_ram_size $WHITEBOX_HUGEPAGE_GUEST_RAM_SIZE
     iniset $TEMPEST_CONFIG whitebox-database user $DATABASE_USER
     iniset $TEMPEST_CONFIG whitebox-database password $DATABASE_PASSWORD
@@ -27,6 +28,7 @@
     iniset $TEMPEST_CONFIG whitebox-hardware cpu_topology "$WHITEBOX_CPU_TOPOLOGY"
     iniset $TEMPEST_CONFIG whitebox-hardware dedicated_cpus_per_numa "$WHITEBOX_DEDICATED_CPUS_PER_NUMA"
     iniset $TEMPEST_CONFIG whitebox-hardware shared_cpus_per_numa "$WHITEBOX_SHARED_CPUS_PER_NUMA"
+    iniset $TEMPEST_CONFIG whitebox-hardware configured_hugepage_sizes "$WHITEBOX_CONFIGURED_HUGEPAGES"
 
     iniset $TEMPEST_CONFIG compute-feature-enabled virtio_rng "$COMPUTE_FEATURE_VIRTIO_RNG"
     iniset $TEMPEST_CONFIG compute-feature-enabled rbd_download "$COMPUTE_FEATURE_RBD_DOWNLOAD"
@@ -39,6 +41,7 @@
     # https://github.com/openstack/devstack/blob/6b0f055b4ed407f8a190f768d0e654235ac015dd/lib/nova#L46C36-L46C50
     iniset $TEMPEST_CONFIG whitebox-nova-compute state_path $DATA_DIR/nova
 
+    iniset $NOVA_CONF filter_scheduler track_instance_changes True
 }
 
 if [[ "$1" == "stack" ]]; then
diff --git a/devstack/settings b/devstack/settings
index 23a946e8..17f3dc58 100644
--- a/devstack/settings
+++ b/devstack/settings
@@ -7,10 +7,12 @@
 WHITEBOX_RX_QUEUE_SIZE=${WHITEBOX_RX_QUEUE_SIZE:-1024}
 WHITEBOX_DEFAULT_VIDEO_MODEL=${WHITEBOX_DEFAULT_VIDEO_MODEL:-'virtio'}
 WHITEBOX_MAX_DISK_DEVICES_TO_ATTACH=${WHITEBOX_MAX_DISK_DEVICES_TO_ATTACH:-7}
 WHITEBOX_NODES_YAML=${WHITEBOX_NODES_YAML:-'/home/zuul/compute_nodes.yaml'}
+WHITEBOX_HUGEPAGE_GUEST_RAM_SIZE=${WHITEBOX_HUGEPAGE_GUEST_RAM_SIZE:-1024}
 WHITEBOX_CPU_TOPOLOGY=${WHITEBOX_CPU_TOPOLOGY:-''}
 WHITEBOX_DEDICATED_CPUS_PER_NUMA=${WHITEBOX_DEDICATED_CPUS_PER_NUMA:-4}
 WHITEBOX_SHARED_CPUS_PER_NUMA=${WHITEBOX_SHARED_CPUS_PER_NUMA:-2}
+WHITEBOX_CONFIGURED_HUGEPAGES=${WHITEBOX_CONFIGURED_HUGEPAGES:-'2048,1048576'}
 
 COMPUTE_FEATURE_VIRTIO_RNG=${COMPUTE_FEATURE_VIRTIO_RNG:-'True'}
 COMPUTE_FEATURE_RBD_DOWNLOAD=${COMPUTE_FEATURE_RBD_DOWNLOAD:-'False'}
diff --git a/playbooks/whitebox/pre.yaml b/playbooks/whitebox/pre.yaml
index ba4dbe4b..42e2e819 100644
--- a/playbooks/whitebox/pre.yaml
+++ b/playbooks/whitebox/pre.yaml
@@ -44,4 +44,45 @@
       shell: |
         cat /home/zuul/compute_nodes.yaml
       run_once: true
-      delegate_to: controller
\ No newline at end of file
+      delegate_to: controller
+
+- hosts: compute
+  tasks:
+    - name: Create hugepages for computes
+      block:
+
+        - name: Append to GRUB command line
+          lineinfile:
+            path: /etc/default/grub
+            state: present
+            backrefs: yes
+            regexp: GRUB_CMDLINE_LINUX="([^"]*)"
+            line: GRUB_CMDLINE_LINUX="\1 hugepagesz=2M hugepages={{ num_2M_pages }} hugepagesz=1G hugepages={{ num_1G_pages }} transparent_hugepage=never"
+          become: yes
+
+        - name: Update grub.cfg
+          # NOTE(artom) This assumes an Ubuntu host
+          command: update-grub2
+          become: yes
+
+        - name: Reboot
+          reboot:
+          become: yes
+
+        - name: (Re-)start the Zuul console streamer after the reboot
+          # NOTE(artom) The job will still work if we don't do this, but the
+          # console will get spammed with 'Waiting on logger' messages. See
+          # https://bugs.launchpad.net/openstack-gate/+bug/1806655 for more
+          # info.
+          import_role:
+            name: start-zuul-console
+
+        - name: Add 1G hugetlbfs mount
+          # The 2M hugetlbfs is mounted automatically by the OS, but we need to
+          # manually add the 1G mount.
+          shell: |
+            mkdir /dev/hugepages1G
+            mount -t hugetlbfs -o pagesize=1G none /dev/hugepages1G
+          become: yes
+
+      when: num_2M_pages is defined and num_1G_pages is defined
diff --git a/whitebox_tempest_plugin/api/compute/base.py b/whitebox_tempest_plugin/api/compute/base.py
index 84a0f4ff..9ba1c00f 100644
--- a/whitebox_tempest_plugin/api/compute/base.py
+++ b/whitebox_tempest_plugin/api/compute/base.py
@@ -435,3 +435,11 @@ class BaseWhiteboxComputeTest(base.BaseV2ComputeAdminTest):
                   'status = "%s"' % status)
         data = cursor.fetchall()
         return data[0]['COUNT(*)']
+
+    def _get_hugepage_xml_element(self, server_id):
+        """Gather and return all instances of the page element from XML element
+        'memoryBacking/hugepages' in a given server's domain.
+        """
+        root = self.get_server_xml(server_id)
+        huge_pages = root.findall('.memoryBacking/hugepages/page')
+        return huge_pages
diff --git a/whitebox_tempest_plugin/api/compute/test_cpu_pinning.py b/whitebox_tempest_plugin/api/compute/test_cpu_pinning.py
index 732a5afc..d59bf30e 100644
--- a/whitebox_tempest_plugin/api/compute/test_cpu_pinning.py
+++ b/whitebox_tempest_plugin/api/compute/test_cpu_pinning.py
@@ -612,14 +612,6 @@
         cpuset = root.find('./vcpu').attrib.get('cpuset', None)
         return hardware.parse_cpu_spec(cpuset)
 
-    def _get_hugepage_xml_element(self, server_id):
-        """Gather and return all instances of the page element from XML element
-        'memoryBacking/hugepages' in a given server's domain.
-        """
-        root = self.get_server_xml(server_id)
-        huge_pages = root.findall('.memoryBacking/hugepages/page')
-        return huge_pages
-
     def _validate_hugepage_elements(self, server_id, pagesize):
         """Analyze the hugepage xml element(s) from a provided instance. Expect
         to find only one hugepage element in the domain. Return boolean result
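For context on the helper being moved into base.py above (and reused by
_get_xml_hugepage_size() in the new test file below): libvirt records hugepage
backing for a guest under <memoryBacking>/<hugepages>, and the page size in KiB
is the 'size' attribute of each <page> element. The following is a minimal
standalone sketch of that extraction; the sample domain XML is hand-written for
illustration, not captured from a real guest.

# Minimal sketch of the check performed via _get_hugepage_xml_element():
# parse the guest's libvirt domain XML and read the hugepage size (in KiB)
# from the <page> element. The XML below is an illustrative sample; the real
# helper obtains the domain XML via get_server_xml().
import xml.etree.ElementTree as ET

SAMPLE_DOMAIN_XML = """
<domain type='kvm'>
  <memoryBacking>
    <hugepages>
      <page size='1048576' unit='KiB' nodeset='0'/>
    </hugepages>
  </memoryBacking>
</domain>
"""

root = ET.fromstring(SAMPLE_DOMAIN_XML)
pages = root.findall('./memoryBacking/hugepages/page')
assert len(pages) == 1, 'expected exactly one hugepage element'
page_size_kb = int(pages[0].attrib['size'])
print(page_size_kb)  # 1048576, i.e. a single 1G page

A flavor with hw:mem_page_size:small produces no <memoryBacking> element at
all, which is why the new tests skip the XML check after resizing to the
'small' flavor.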
diff --git a/whitebox_tempest_plugin/api/compute/test_hugepages.py b/whitebox_tempest_plugin/api/compute/test_hugepages.py
new file mode 100644
index 00000000..85a148a6
--- /dev/null
+++ b/whitebox_tempest_plugin/api/compute/test_hugepages.py
@@ -0,0 +1,196 @@
+# Copyright 2022 Red Hat Inc.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+from tempest import config
+import testtools
+
+from whitebox_tempest_plugin.api.compute import base
+
+from oslo_log import log as logging
+
+CONF = config.CONF
+LOG = logging.getLogger(__name__)
+
+
+class HugePageResize(base.BaseWhiteboxComputeTest):
+
+    @classmethod
+    def skip_checks(cls):
+        super(HugePageResize, cls).skip_checks()
+        if len(getattr(CONF.whitebox_hardware,
+                       'configured_hugepage_sizes')) == 0:
+            msg = "configured_hugepage_sizes in whitebox-hardware is not " \
+                  "present"
+            raise cls.skipException(msg)
+
+    def _get_xml_hugepage_size(self, server_id):
+        """Analyze the hugepage xml element(s) from a provided instance.
+        Expect to find only one hugepage element in the domain, and return
+        the page size found in that element's 'size' attribute.
+        """
+        huge_pages_list = self._get_hugepage_xml_element(server_id)
+        self.assertEqual(1, len(huge_pages_list), "Expected to find 1 "
+                         "hugepage XML element on server %s but found %s"
+                         % (server_id, len(huge_pages_list)))
+        huge_page_xml = huge_pages_list[0]
+        return int(huge_page_xml.attrib['size'])
+
+    def test_hugepage_resize_large_to_small(self):
+        """Resize a guest with large hugepages to small hugepages and back
+
+        Create a guest using a flavor with hw:mem_page_size:large, resize it
+        to a flavor with hw:mem_page_size:small, and then resize it back to
+        the original flavor
+        """
+        flavor_a = self.create_flavor(
+            ram=str(CONF.whitebox.hugepage_guest_ram_size),
+            extra_specs={'hw:mem_page_size': 'large'})
+
+        server = self.create_test_server(flavor=flavor_a['id'],
+                                         wait_until='ACTIVE')
+
+        # We cannot assume the exact pagesize of the guest, so verify that the
+        # backing memory element is present on the guest and that the size
+        # found is greater than or equal to the smallest potential size
+        # configured in the environment
+        large_page_size = self._get_xml_hugepage_size(server['id'])
+        minimum_pagesize_threshold = \
+            min(CONF.whitebox_hardware.configured_hugepage_sizes)
+        self.assertTrue(
+            large_page_size >= minimum_pagesize_threshold,
+            "Pagesize found %s should be greater than or equal to pagesize "
+            "of %s for server %s" %
+            (large_page_size, minimum_pagesize_threshold, server['id'])
+        )
+
+        # Resize the guest using a flavor with hw:mem_page_size:small; the
+        # memory backing element should no longer be present on the guest, so
+        # no XML verification is needed after this resize
+        flavor_b = self.create_flavor(
+            ram=str(CONF.whitebox.hugepage_guest_ram_size),
+            extra_specs={'hw:mem_page_size': 'small'})
+        self.resize_server(server['id'], flavor_b['id'])
+
+        # Resize the instance back to the starting flavor and repeat the XML
+        # check of the guest
+        self.resize_server(server['id'], flavor_a['id'])
+        large_page_size = self._get_xml_hugepage_size(server['id'])
+        self.assertTrue(
+            large_page_size >= minimum_pagesize_threshold,
+            "After resizing back to the original flavor, pagesize found %s "
+            "should be greater than or equal to pagesize of %s for server %s"
+            % (large_page_size, minimum_pagesize_threshold, server['id'])
+        )
+
+    def test_hugepage_resize_size_to_small(self):
+        """Resize a guest with a specified hugepage size to small hugepages
+
+        Create a guest using a flavor with an explicit hugepage size, based
+        on what is configured in whitebox_hardware. Resize the guest to a
+        flavor with hw:mem_page_size:small, and then resize it back to the
+        original flavor. Repeat this process for every hugepage size
+        configured in whitebox_hardware.configured_hugepage_sizes
+        """
+        flavor_small = self.create_flavor(
+            ram=str(CONF.whitebox.hugepage_guest_ram_size),
+            extra_specs={'hw:mem_page_size': 'small'})
+
+        # Create a flavor and launch an instance based on every configured
+        # hugepage size in the deployment.
+        for page_size in CONF.whitebox_hardware.configured_hugepage_sizes:
+            flavor_a = self.create_flavor(
+                ram=str(CONF.whitebox.hugepage_guest_ram_size),
+                extra_specs={'hw:mem_page_size': str(page_size)})
+
+            server = self.create_test_server(flavor=flavor_a['id'],
+                                             wait_until='ACTIVE')
+
+            size_found = self._get_xml_hugepage_size(server['id'])
+            self.assertTrue(
+                page_size == size_found,
+                "Expected pagesize of %s not found on server %s, instead "
+                "found %s" % (page_size, server['id'], size_found)
+            )
+
+            # Resize the guest using the flavor with hw:mem_page_size:small;
+            # the memory backing element will not be present in the guest, so
+            # follow-up XML verification is not necessary
+            self.resize_server(server['id'], flavor_small['id'])
+
+            # Resize back to the original size and confirm the memory backing
+            # element is present and has the correct size
+            self.resize_server(server['id'], flavor_a['id'])
+            size_found = self._get_xml_hugepage_size(server['id'])
+            self.assertTrue(
+                page_size == size_found,
+                "Expected pagesize of %s not found on server %s after "
+                "resizing back to original flavor size, instead found %s" %
+                (page_size, server['id'], size_found)
+            )
+
+            self.delete_server(server['id'])
+
+    @testtools.skipUnless(
+        len(CONF.whitebox_hardware.configured_hugepage_sizes) > 1,
+        'Need at least 2 configured hugepage sizes to execute test')
+    def test_hugepage_resize_size_to_size(self):
+        """Resize a guest with a specified hugepage size to another size
+
+        Create two flavors based on the first two hugepage sizes provided in
+        whitebox_hardware.configured_hugepage_sizes; both flavors use
+        explicit page sizes. Create a server using the first flavor, resize
+        the guest to the second flavor, and resize it back to the original
+        flavor
+        """
+        start_size, target_size = \
+            CONF.whitebox_hardware.configured_hugepage_sizes[:2]
+
+        flavor_a = self.create_flavor(
+            ram=str(CONF.whitebox.hugepage_guest_ram_size),
+            extra_specs={'hw:mem_page_size': str(start_size)})
+
+        server = self.create_test_server(flavor=flavor_a['id'],
+                                         wait_until='ACTIVE')
+
+        size_found = self._get_xml_hugepage_size(server['id'])
+        self.assertTrue(
+            start_size == size_found,
+            "Expected pagesize of %s not found on server %s, instead "
+            "found %s" % (start_size, server['id'], size_found)
+        )
+
+        flavor_b = self.create_flavor(
+            ram=str(CONF.whitebox.hugepage_guest_ram_size),
+            extra_specs={'hw:mem_page_size': str(target_size)})
+
+        # Resize to the target size and confirm the memory backing element is
+        # present and has the correct size
+        self.resize_server(server['id'], flavor_b['id'])
+        size_found = self._get_xml_hugepage_size(server['id'])
+        self.assertTrue(
+            target_size == size_found,
+            "Expected pagesize of %s not found on server %s after resize, "
+            "instead found %s" % (target_size, server['id'], size_found)
+        )
+
+        # Resize back to the original size and confirm the memory backing
+        # element is present and has the correct size
+        self.resize_server(server['id'], flavor_a['id'])
+        size_found = self._get_xml_hugepage_size(server['id'])
+
+        self.assertTrue(
+            start_size == size_found,
+            "Expected pagesize of %s not found on server %s after resizing "
+            "back to original flavor size, instead found %s" %
+            (start_size, server['id'], size_found)
+        )
diff --git a/whitebox_tempest_plugin/config.py b/whitebox_tempest_plugin/config.py
index 8394f415..40fd9cbb 100644
--- a/whitebox_tempest_plugin/config.py
+++ b/whitebox_tempest_plugin/config.py
@@ -133,7 +133,12 @@
         'libvirt_hw_machine_type',
         default='pc',
         choices=["pc", "q35"],
-        help='The machine type configured for the nova computes')
+        help='The machine type configured for the nova computes'),
+    cfg.IntOpt(
+        'hugepage_guest_ram_size',
+        default=64,
+        help="RAM size in MB to use when launching the guests backed "
+             "by hugepages."),
 ]
 
 nova_compute_group = cfg.OptGroup(
@@ -224,6 +229,12 @@
                 '. For example, if NUMA node 0 has '
                 'CPUs 0 and 1, and NUMA node 1 has CPUs 2 and 3, the value to '
                 'set would be `0: [0,1], 1: [2, 3]`.'),
+    cfg.Opt(
+        'configured_hugepage_sizes',
+        type=types.List(types.Integer()),
+        default=[],
+        help='List of hugepage sizes configured on the compute hosts, '
+             'in kB, e.g. 2048,1048576'),
     cfg.IntOpt(
         'dedicated_cpus_per_numa',
         default=0,
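A note on the numbers used by the new whitebox-devstack-multinode-hugepages
job: with num_2M_pages: 512 and num_1G_pages: 1 on the kernel command line,
512 x 2 MiB + 1 x 1 GiB = 2 GiB of RAM is reserved at boot, enough for one
guest at the devstack default WHITEBOX_HUGEPAGE_GUEST_RAM_SIZE of 1024 MB on
either page size. Assuming the standard Linux sysfs layout, the reservation
(and the values to put in [whitebox-hardware]configured_hugepage_sizes) can be
cross-checked on a compute host with a sketch like the following:

# Sketch (assumes the standard Linux sysfs layout): list the hugepage sizes
# and the number of pages reserved for each, as configured by the GRUB
# command line that playbooks/whitebox/pre.yaml sets up.
import os
import re

SYSFS_HUGEPAGES = '/sys/kernel/mm/hugepages'

def reserved_hugepages(sysfs_root=SYSFS_HUGEPAGES):
    """Return a {page_size_in_kB: nr_hugepages} mapping for this host."""
    reserved = {}
    for entry in os.listdir(sysfs_root):
        match = re.match(r'^hugepages-(\d+)kB$', entry)
        if not match:
            continue
        with open(os.path.join(sysfs_root, entry, 'nr_hugepages')) as fh:
            reserved[int(match.group(1))] = int(fh.read())
    return reserved

# Expected on the new job's nodes: {2048: 512, 1048576: 1}, i.e. the keys
# match the devstack default WHITEBOX_CONFIGURED_HUGEPAGES='2048,1048576'.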
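Similarly, the 1G hugetlbfs mount added by pre.yaml (the 2M one is mounted by
the OS at /dev/hugepages) can be confirmed by reading /proc/mounts; a small
sketch, assuming the usual /proc/mounts field order:

# Sketch (assumes the usual /proc/mounts field layout): map hugetlbfs
# mountpoints to their mount options, so the /dev/hugepages1G mount created
# in playbooks/whitebox/pre.yaml can be verified after the reboot.
def hugetlbfs_mounts(proc_mounts='/proc/mounts'):
    """Return a {mountpoint: options} mapping for all hugetlbfs mounts."""
    mounts = {}
    with open(proc_mounts) as fh:
        for line in fh:
            device, mountpoint, fstype, options = line.split()[:4]
            if fstype == 'hugetlbfs':
                mounts[mountpoint] = options
    return mounts

# Expected to contain '/dev/hugepages' (mounted by the OS) and
# '/dev/hugepages1G' with a pagesize option along the lines of
# 'rw,relatime,pagesize=1024M' (exact option strings vary by kernel).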