Compare commits


28 Commits

Zuul
024815af71 Merge "use cinder migrate for swap volume" into stable/2025.1 2025-08-19 13:39:36 +00:00
Sean Mooney
ffec800f59 use cinder migrate for swap volume
This change removes Watcher's in-tree functionality
for swapping instance volumes and defines swap as an alias
of cinder volume migrate.

The Watcher-native implementation was missing error handling,
which could lead to irretrievable data loss.

The removed code also forged project user credentials to
perform admin requests as if they were made by a member of the
project. This was unsafe and posed a security risk due to how
it was implemented. The code has been removed without replacement.

While some effort has been made to keep existing
audit definitions working, any reduction of functionality
resulting from this security hardening is intentional.

Closes-Bug: #2112187
Change-Id: Ic3b6bfd164e272d70fe86d7b182478dd962f8ac0
Signed-off-by: Sean Mooney <work@seanmooney.info>
(cherry picked from commit 3742e0a79c)
2025-08-18 16:36:38 +00:00
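The delegation this commit describes can be sketched roughly as follows. This is an illustrative sketch, not Watcher's actual code: `SwapVolumeAction`, `FakeClient`, and the destination host string are hypothetical stand-ins, and only the shape of a cinder `volumes.migrate_volume` call is assumed.

```python
# Sketch (assumption, not Watcher's code): "swap" delegated to cinder
# volume migrate instead of a native attachment-update workflow.

class SwapVolumeAction:
    """Hypothetical action that delegates swap to cinder migrate."""

    def __init__(self, client):
        # `client` stands in for a cinder API client object.
        self.client = client

    def execute(self, volume_id, dest_host):
        # Cinder owns the copy and deletes the source only on success,
        # avoiding the data-loss window of the removed native path.
        return self.client.volumes.migrate_volume(
            volume_id, dest_host, force_host_copy=False, lock_volume=True)


class FakeVolumesAPI:
    def migrate_volume(self, volume_id, host, force_host_copy, lock_volume):
        return {"volume": volume_id, "host": host, "locked": lock_volume}


class FakeClient:
    volumes = FakeVolumesAPI()


result = SwapVolumeAction(FakeClient()).execute("vol-1", "cinder-2@lvm#pool")
print(result["host"])  # → cinder-2@lvm#pool
```

The fake client exists only so the sketch is runnable; in a real deployment the action would receive an authenticated cinder client for the project.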
Douglas Viroel
defd3953d8 Configure watcher tempest's microversion in devstack
Adds a tempest configuration for the minimum and maximum microversions
supported by watcher. This helps us define the correct range of
microversions to be tested on each stable branch.
New microversion proposals should also increase the default
max_microversion so that they work with watcher-tempest-plugin
microversion testing.

Change-Id: I0b695ba4530eb89ed17b3935b87e938cadec84cc
(cherry picked from commit adfe3858aa)
Signed-off-by: Douglas Viroel <viroel@gmail.com>
2025-08-04 12:18:53 +00:00
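Under this change, the resulting tempest.conf fragment would look something like the following (the values shown are the defaults set by the devstack change later in this diff; the section name ``optimize`` comes from the same change):

```ini
[optimize]
min_microversion = 1.0
max_microversion = 1.4
```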
Zuul
de75a2a5b2 Merge "Return HTTP code 400 when creating an audit with wrong parameters" into stable/2025.1 2025-07-21 19:28:47 +00:00
Zuul
7c15812d68 Merge "Add a unit test to check the error when creating an audit with wrong parameters" into stable/2025.1 2025-07-21 19:28:45 +00:00
Zuul
c47f6fb66c Merge "Fix audit creation with no name and no goal or audit_template" into stable/2025.1 2025-07-21 19:28:43 +00:00
Zuul
a4ece6f084 Merge "Added unit test to validate audit creation with no goal and no name" into stable/2025.1 2025-07-21 19:25:04 +00:00
Zuul
758acdfb99 Merge "Set actionplan state to FAILED if any action has failed" into stable/2025.1 2025-07-08 19:46:06 +00:00
Zuul
b65cfc283a Merge "Add unit test to check action plan state when a nested action fails" into stable/2025.1 2025-07-08 19:46:05 +00:00
Zuul
f5a21ba43e Merge "Aggregate by label when querying instance cpu usage in prometheus" into stable/2025.1 2025-07-08 12:41:34 +00:00
Alfredo Moralejo
ba417b38bf Fix audit creation with no name and no goal or audit_template
Currently, audit creation fails in this case because watcher tries to
generate a name from the goal automatically, and the goal is not defined.

This patch moves the check for the goal specification earlier in the
audit creation call; if no goal is defined, it returns an invalid-call
error.

This patch also updates the existing test for this case to check
the expected behavior.

Closes-Bug: #2110947

Change-Id: I6f3d73b035e8081e86ce82c205498432f0e0fc33
(cherry picked from commit bf6a28bd1e)
Signed-off-by: Alfredo Moralejo <amoralej@redhat.com>
2025-07-02 08:16:12 +02:00
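The reordering this commit describes can be sketched as below. All names are illustrative, not Watcher's actual API code; the point is only that the goal check runs before the name generator that needs the goal.

```python
# Minimal sketch (names are hypothetical) of validating the goal before
# auto-generating an audit name, so a missing goal is a client error
# rather than an internal failure.

class InvalidRequest(Exception):
    """Maps to HTTP 400 in the API layer."""


def create_audit(payload):
    goal = payload.get("goal") or payload.get("audit_template")
    if goal is None:
        # Fail early: the name generator below needs a goal.
        raise InvalidRequest("a goal or audit_template is required")
    name = payload.get("name") or f"{goal}-auto"
    return {"name": name, "goal": goal}


print(create_audit({"goal": "dummy"})["name"])  # → dummy-auto
try:
    create_audit({})
except InvalidRequest as exc:
    print(f"400: {exc}")
```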
Alfredo Moralejo
38622442f2 Set actionplan state to FAILED if any action has failed
Currently, an action plan's state is set to SUCCEEDED once execution
has finished, but that does not imply that all the actions finished
successfully.

This patch checks the actual state of all the actions in the plan
after execution has finished. If any action has status FAILED, it
sets the state of the action plan to FAILED and applies the
appropriate notification parameters. This is the expected behavior
according to the Watcher documentation.

The patch also fixes the unit test to expect the action plan state
FAILED and the corresponding notification parameters.

Closes-Bug: #2106407
Change-Id: I7bfc6759b51cd97c26ec13b3918bd8d3b7ac9d4e
(cherry picked from commit 88d81c104e)
Signed-off-by: Alfredo Moralejo <amoralej@redhat.com>
2025-07-02 08:12:45 +02:00
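The rule described above reduces to a simple aggregation over action states. A hedged sketch (function and state names are illustrative, not Watcher's internals):

```python
# Sketch of the described rule: a plan is FAILED if any action failed,
# and SUCCEEDED only when every action succeeded.

def final_plan_state(action_states):
    if any(state == "FAILED" for state in action_states):
        return "FAILED"
    return "SUCCEEDED"


print(final_plan_state(["SUCCEEDED", "FAILED", "SUCCEEDED"]))  # → FAILED
print(final_plan_state(["SUCCEEDED", "SUCCEEDED"]))            # → SUCCEEDED
```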
Alfredo Moralejo
c7fde92411 Add unit test to check action plan state when a nested action fails
This patch adds a new unit test to check the behavior of an action
plan when one of its actions fails during execution.

Note that this test demonstrates a bug; the expected state will be
changed in the fixing patch.

Related-Bug: #2106407
Change-Id: I2f3fe8f4da772a96db098066d253e5dee330101a
(cherry picked from commit b36ba8399e)
Signed-off-by: Alfredo Moralejo <amoralej@redhat.com>
2025-07-02 08:12:13 +02:00
Alfredo Moralejo
e5b5ff5d56 Return HTTP code 400 when creating an audit with wrong parameters
Currently, when trying to create an audit that is missing a mandatory
parameter, watcher returns error 500 instead of 400, which is the
documented error in the API [1] and the appropriate error code for
malformed requests.

This patch catches parameter validation errors against the JSON
schema for each strategy and returns error 400. It also fixes the
unit test to validate the expected behavior.

[1] https://docs.openstack.org/api-ref/resource-optimization/#audits

Closes-Bug: #2110538
Change-Id: I23232b3b54421839bb01d54386d4e7b244f4e2a0
(cherry picked from commit 4629402f38)
Signed-off-by: Alfredo Moralejo <amoralej@redhat.com>
2025-07-02 08:07:26 +02:00
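The catch-and-map pattern this fix describes can be sketched as below. This is not the actual Watcher code: the `threshold` parameter and the plain required-keys check stand in for a real per-strategy JSON schema validation, and the tuple return stands in for the API response.

```python
# Illustrative sketch: validation errors from a strategy's parameter
# schema are caught and mapped to HTTP 400 instead of escaping as 500.

REQUIRED = {"threshold"}  # hypothetical strategy parameter schema


def validate_params(params):
    missing = REQUIRED - params.keys()
    if missing:
        raise ValueError(f"missing parameters: {sorted(missing)}")


def create_audit(params):
    try:
        validate_params(params)
    except ValueError as exc:
        return 400, str(exc)  # malformed request, per the API reference
    return 201, "created"


print(create_audit({}))                   # 400 with a descriptive message
print(create_audit({"threshold": 0.8}))   # → (201, 'created')
```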
Zuul
3a923dbf16 Merge "Use KiB as unit for host_ram_usage when using prometheus datasource" into stable/2025.1 2025-06-27 16:59:14 +00:00
Alfredo Moralejo
fb85b27ae3 Use KiB as unit for host_ram_usage when using prometheus datasource
The prometheus datasource was reporting host_ram_usage in MiB, as
described in the docstring of the base datasource interface
definition [1].

However, the gnocchi datasource reports it in KiB, following the
ceilometer metric `hardware.memory.used` [2], and the strategies
using that metric expect KiB. The best approach is therefore to
change the unit in the prometheus datasource and update the
docstring to avoid misunderstandings in the future. This patch
fixes the prometheus datasource to return host_ram_usage
in KiB instead of MiB.

Additionally, it adds more unit tests for the check_threshold
method so that it covers the memory-based strategy execution, validates
the calculated standard deviation, and adds cases where it is below
the threshold.

[1] 15981117ee/watcher/decision_engine/datasources/base.py (L177-L183)
[2] https://docs.openstack.org/ceilometer/train/admin/telemetry-measurements.html#snmp-based-meters

Closes-Bug: #2113776
Change-Id: Idc060d1e709c0265c64ada16062c3a206c6b04fa
(cherry picked from commit 6ea362da0b)
2025-06-20 16:35:40 +00:00
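The unit fix is a simple scale change, sketched below. The function name and the bytes-based input are illustrative, not the datasource's real signature; the point is the MiB→KiB factor of 1024.

```python
# Sketch of the unit fix: the prometheus datasource computed MiB, but the
# strategies expect KiB (matching ceilometer's hardware.memory.used), so
# the value is scaled by 1024 before being returned.

def host_ram_usage_kib(used_bytes):
    mib = used_bytes / (1024 * 1024)
    return mib * 1024  # report KiB, as the base datasource docstring now states


print(host_ram_usage_kib(2 * 1024**3))  # 2 GiB used → 2097152.0 KiB
```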
Alfredo Moralejo
53872f9af2 Aggregate by label when querying instance cpu usage in prometheus
Currently, when the prometheus datasource queries the ceilometer_cpu
metric for instance cpu usage, it aggregates by instance and filters
by the label containing the instance uuid. While this works fine in
real scenarios, where a single metric is provided by a single instance,
in some cases, such as CI jobs where metrics are injected directly, it
leads to incorrect metric calculation.

We applied a similar fix for the host metrics in [1] but did not
implement it for instance cpu.

This change also converts the query formatting to the dict format to
improve readability.

[1] https://review.opendev.org/c/openstack/watcher/+/946049

Closes-Bug: #2113936
Change-Id: I3038dec20612162c411fc77446e86a47e0354423
(cherry picked from commit 3860de0b1e)
2025-06-19 14:54:56 +00:00
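A hedged sketch of the dict-based query construction described above. The label name `resource`, the function name, and the exact PromQL shape are assumptions for illustration, not Watcher's actual query builder; the point is aggregating by the uuid-bearing label rather than by `instance`, so duplicate or injected series collapse correctly.

```python
# Hypothetical sketch: build an instance-cpu PromQL query from a dict,
# aggregating by the uuid label instead of the `instance` label.

def build_cpu_query(uuid, uuid_label="resource", period=300):
    q = {
        "metric": "ceilometer_cpu",
        "filter": f'{{{uuid_label}="{uuid}"}}',
        "agg": f"avg by ({uuid_label})",
        "window": f"[{period}s]",
    }
    return f'{q["agg"]}(rate({q["metric"]}{q["filter"]}{q["window"]}))'


print(build_cpu_query("abc-123"))
# → avg by (resource)(rate(ceilometer_cpu{resource="abc-123"}[300s]))
```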
Chandan Kumar (raukadah)
c0ebb8ddb3 Drop code from Host maintenance strategy migrating instance to disabled hosts
Currently, the host maintenance strategy also migrates instances from
the maintenance node to watcher_disabled compute nodes.

watcher_disabled compute nodes might have been disabled for some other
purpose by a different strategy. If host maintenance uses those compute
nodes for migration, it might affect customer workloads.

The host maintenance strategy should never touch disabled hosts unless
the user specifies a disabled host as the backup node.

This change drops the logic for using disabled compute nodes for
maintenance. Host maintenance already uses the nova scheduler for
migrating instances and will continue to do so. If there is no
available node, the strategy will fail.

Closes-Bug: #2109945

Change-Id: If9795fd06f684eb67d553405cebd8a30887c3997
Signed-off-by: Chandan Kumar (raukadah) <chkumar@redhat.com>
(cherry picked from commit 9dea55bd64)
2025-06-09 19:40:41 +05:30
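The destination policy described above can be sketched as below. Names are illustrative, not the strategy's real code; `None` stands for "no explicit destination, let the nova scheduler decide".

```python
# Sketch of the described policy: use the backup node when one is given;
# otherwise leave the destination unset so the nova scheduler picks an
# enabled host. Disabled hosts are never selected implicitly.

def pick_destination(backup_node, enabled_hosts):
    if backup_node:
        return backup_node  # user explicitly chose it, even if disabled
    if not enabled_hosts:
        raise RuntimeError("no available node; strategy fails")
    return None  # None → let the nova scheduler decide


print(pick_destination("backup-0", ["c1", "c2"]))  # → backup-0
print(pick_destination(None, ["c1"]))              # → None
```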
Alfredo Moralejo
1d7f163651 Added unit test to validate audit creation with no goal and no name
This patch adds a new unit test to validate the behavior of the API
when trying to create an audit without a goal (whether via the goal
or audit_template parameters) and without a name.

Related-Bug: https://bugs.launchpad.net/watcher/+bug/2110947
Change-Id: I04df10a8a0eea4509856f2f4b9d11bae24cd563a
(cherry picked from commit 0651fff910)
2025-06-04 13:06:14 +00:00
Alfredo Moralejo
c6ceaacf27 Add a unit test to check the error when creating an audit with wrong parameters
Currently, it returns HTTP error code 500 instead of 400, which
would be the appropriate code.

A follow-up patch will be sent with the fix, switching the error code
and message.

Related-Bug: #2110538
Change-Id: I35ccbb9cf29fc08e78c4d5f626a6518062efbed3
(cherry picked from commit 891119470c)
2025-06-04 12:52:40 +00:00
Chandan Kumar (raukadah)
f4bfb10525 [host_maintenance] Pass dest hostname in add_action solution
Currently, we pass the src_node and dest_node UUIDs when running the
migrate action.

In the watcher-applier log, migration fails with the following exception:
```
Nova client exception occurred while live migrating instance <uuid>Exception: Compute host <uuid> could not be found
```
Based on 57f55190ff/watcher/applier/actions/migration.py (L122)
and
57f55190ff/watcher/common/nova_helper.py (L322),
live_migrate_instance expects the destination hostname, not a UUID.

This change replaces the dest_node UUID with the hostname.

Closes-Bug: #2109309

Change-Id: I3911ff24ea612f69dddae5eab15fabb4891f938d
Signed-off-by: Chandan Kumar (raukadah) <chkumar@redhat.com>
(cherry picked from commit 278cb7e98c)
2025-05-05 02:53:03 +00:00
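The fix amounts to resolving the node UUID to its hostname before building the migrate action. A hedged sketch (the map, function, and node identifiers are hypothetical stand-ins for the compute model lookup):

```python
# Illustrative sketch: resolve the destination node's UUID to its hostname
# before live migration, since nova expects a hostname, not a compute UUID.

NODES = {"node-uuid-1": "compute-1.example.org"}  # hypothetical model data


def live_migrate_params(instance_id, dest_node_uuid):
    dest_hostname = NODES[dest_node_uuid]  # pass hostname, not UUID
    return {"instance": instance_id, "destination": dest_hostname}


print(live_migrate_params("inst-1", "node-uuid-1")["destination"])
# → compute-1.example.org
```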
Sean Mooney
8a99d4c5c1 Add support for pyproject.toml and wsgi module paths
pip 23.1 removed the "setup.py install" fallback for projects that do
not have a pyproject.toml and now uses a pyproject.toml which is vendored
in pip [1][2]. pip 24.2 deprecated a similar fallback to
"setup.py develop" and planned to fully remove it in pip 25.0 [3][4][5].
pbr has supported editable installs since 6.0.0.

pip 25.1 has now been released and the removal is complete. This change
adds our own minimal pyproject.toml to ensure we are using the
correct build system.

This change also requires that we adapt how we generate our wsgi
entry point. When pyproject.toml is used, the wsgi console script is
not generated in an editable install such as is used in devstack.

To address this we need to refactor our usage of our wsgi application
to use a module path instead. This change does not remove
the declaration of our wsgi_script entry point, but it should
be considered deprecated and will be removed in the future.

To unblock the gate, the devstack plugin is modified to deploy
using the wsgi module instead of the console script.

Finally, support for the mod_wsgi wsgi mode is removed.
That was deprecated in devstack a few cycles ago and
support was removed in I8823e98809ed6b66c27dbcf21a00eea68ef403e8

[1] https://pip.pypa.io/en/stable/news/#v23-1
[2] https://github.com/pypa/pip/issues/8368
[3] https://pip.pypa.io/en/stable/news/#v24-2
[4] https://github.com/pypa/pip/issues/11457
[5] https://ichard26.github.io/blog/2024/08/whats-new-in-pip-24.2/
Closes-Bug: #2109608

Change-Id: Iad77939ab0403c5720c549f96edfc77d2b7d90ee
2025-05-01 00:19:23 +00:00
Alfredo Moralejo
ce9f0b4c1e Skip real-data tests in non-real-data jobs
This change excludes execution of strategies annotated with `real_load`
in non-real-load jobs.

This is a partial backport of [1].

[1] https://review.opendev.org/c/openstack/watcher/+/945627

Change-Id: I77d4c23ebc21693bba8ca0247b8954c6dc8eaba9
2025-04-24 17:02:21 +02:00
Alfredo Moralejo
e385ece629 Aggregate by fqdn label instead instance in host cpu metrics
While in a regular case a specific metric for a specific host will be
provided by a single instance (exporter), so aggregating by label and by
instance should be equivalent, it is more correct to aggregate by the
same label as the one we use to filter the metrics.

This is follow up of https://review.opendev.org/c/openstack/watcher/+/944795

Related-Bug: #2103451

Change-Id: Ia61f051547ddc51e0d1ccd5a56485ab49ce84c2e
(cherry picked from commit c7158b08d1)
2025-04-09 09:00:17 +02:00
Alfredo Moralejo
c6505ad06f Query by fqdn_label instead of instance for host metrics
Currently we use the `instance` label to query prometheus for host
metrics. This label is assigned to the url of each endpoint being
scraped.

While this works fine in one-exporter-per-compute cases, since the
driver maps the fqdn_label value to the `instance` label value, it fails
when more than one target has the same value for the fqdn label. This is
a valid case: being able to query by fqdn without caring about which
exporter on the host provides the metric.

This patch changes the queries we use for hosts to be based on the
fqdn_label instead of the instance one. To implement it, we also
simplify the way we check that a metric exists for a host by converting
prometheus_fqdn_instance_map into a prometheus_fqdn_labels set,
which stores the fqdns found in prometheus.

Closes-Bug: #2103451
Change-Id: I3bcc317441b73da5c876e53edd4622370c6d575e
(cherry picked from commit a65e7e9b59)
2025-04-09 08:59:52 +02:00
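The map-to-set simplification described above can be sketched as below. The target structure mimics entries from the Prometheus targets API; the function name is illustrative, not the driver's actual code.

```python
# Sketch of the simplification: collapse the old fqdn→instance map into a
# set of fqdn label values, which is all that is needed to check whether
# metrics exist for a host.

def build_fqdn_labels(targets, fqdn_label="fqdn"):
    # `targets` mimics entries returned by the Prometheus targets API.
    return {t["labels"][fqdn_label] for t in targets if fqdn_label in t["labels"]}


targets = [
    {"labels": {"fqdn": "ena.controlplane.domain", "instance": "10.1.2.1:9100"}},
    {"labels": {"fqdn": "ena.controlplane.domain", "instance": "10.1.2.1:9200"}},
    {"labels": {"fqdn": "dio.controlplane.domain", "instance": "10.1.2.2:9100"}},
]
print(sorted(build_fqdn_labels(targets)))  # duplicate-fqdn targets collapse
```

Note how the two exporters on `ena.controlplane.domain` collapse into one entry, which is exactly the multi-target case the old instance-keyed map mishandled.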
Chandan Kumar (raukadah)
64f70b948d Drop sg_core prometheus related vars
The depends-on PR removes the installation of prometheus [1] and node
exporter [2] from sg_core. We no longer need to define those vars in
the devstack config.

Links:
[1]. https://github.com/openstack-k8s-operators/sg-core/pull/21
[2]. https://github.com/openstack-k8s-operators/sg-core/pull/23

Note: We do not need to enable the sg_core service on the compute node,
so its plugin call is removed.

Change-Id: Ie8645813a360605635de4dff9e8d1ba0d7a0cdc3
Signed-off-by: Chandan Kumar (raukadah) <raukadah@gmail.com>
(cherry picked from commit 0702cb3869)
2025-04-09 08:58:28 +02:00
OpenStack Release Bot
68c9ce65d2 Update TOX_CONSTRAINTS_FILE for stable/2025.1
Update the URL to the upper-constraints file to point to the redirect
rule on releases.openstack.org so that anyone working on this branch
will switch to the correct upper-constraints list automatically when
the requirements repository branches.

Until the requirements repository has a stable/2025.1 branch, tests will
continue to use the upper-constraints list on master.

Change-Id: I29d11e287122d21e62bd6266a193db480dcc4a23
2025-03-13 13:51:53 +00:00
OpenStack Release Bot
5fa0926528 Update .gitreview for stable/2025.1
Change-Id: Ic5083f5a799b269aee36b2c83408f0ba7cbded0d
2025-03-13 13:51:51 +00:00
34 changed files with 652 additions and 422 deletions


@@ -2,3 +2,4 @@
 host=review.opendev.org
 port=29418
 project=openstack/watcher.git
+defaultbranch=stable/2025.1


@@ -34,6 +34,7 @@
 vars:
 tempest_concurrency: 1
 tempest_test_regex: watcher_tempest_plugin.tests.scenario.test_execute_strategies
+tempest_exclude_regex: .*\[.*\breal_load\b.*\].*
 - job:
 name: watcher-tempest-multinode
@@ -213,10 +214,12 @@
 CEILOMETER_BACKENDS: "sg-core"
 CEILOMETER_PIPELINE_INTERVAL: 15
 CEILOMETER_ALARM_THRESHOLD: 6000000000
-NODE_EXPORTER_ENABLE: false
-PROMETHEUS_ENABLE: false
-PROMETHEUS_SERVICE_SCRAPE_TARGETS: "sg-core,node-exporter"
 PROMETHEUS_CONFIG_FILE: "/home/zuul/prometheus.yml"
+# Disable sg_core prometheus config copy
+PROMETHEUS_ENABLE: false
+# PROMETHEUS_CONFIG_FILE var conflicts with sg_core var
+# to avoid issue, set PROMETHEUS_CONF_DIR
+PROMETHEUS_CONF_DIR: "/home/zuul"
 devstack_local_conf:
 post-config:
 $WATCHER_CONF:
@@ -253,6 +256,7 @@
 tempest_plugins:
 - watcher-tempest-plugin
 tempest_test_regex: watcher_tempest_plugin.tests.scenario.test_execute_strategies
+tempest_exclude_regex: .*\[.*\breal_load\b.*\].*
 tempest_concurrency: 1
 tox_envlist: all
 zuul_copy_output:
@@ -261,7 +265,6 @@
 subnode:
 devstack_plugins:
 ceilometer: https://opendev.org/openstack/ceilometer
-sg-core: https://github.com/openstack-k8s-operators/sg-core
 devstack-plugin-prometheus: https://opendev.org/openstack/devstack-plugin-prometheus
 devstack_services:
 ceilometer-acompute: true
@@ -271,9 +274,9 @@
 devstack_localrc:
 CEILOMETER_BACKEND: "none"
 CEILOMETER_BACKENDS: "none"
-# sg_core related var
-NODE_EXPORTER_ENABLE: false
-PROMETHEUS_ENABLE: false
+# avoid collecting real host cpu metric since tests
+# will inject fake metrics when needed
+NODE_EXPORTER_COLLECTOR_EXCLUDE: "cpu"
 devstack_local_conf:
 post-config:
 $WATCHER_CONF:


@@ -55,11 +55,7 @@ else
 WATCHER_BIN_DIR=$(get_python_exec_prefix)
 fi
-# There are 2 modes, which is "uwsgi" which runs with an apache
-# proxy uwsgi in front of it, or "mod_wsgi", which runs in
-# apache. mod_wsgi is deprecated, don't use it.
-WATCHER_USE_WSGI_MODE=${WATCHER_USE_WSGI_MODE:-$WSGI_MODE}
-WATCHER_UWSGI=$WATCHER_BIN_DIR/watcher-api-wsgi
+WATCHER_UWSGI=watcher.wsgi.api:application
 WATCHER_UWSGI_CONF=$WATCHER_CONF_DIR/watcher-uwsgi.ini
 if is_suse; then
@@ -73,11 +69,7 @@ WATCHER_SERVICE_PORT=${WATCHER_SERVICE_PORT:-9322}
 WATCHER_SERVICE_PORT_INT=${WATCHER_SERVICE_PORT_INT:-19322}
 WATCHER_SERVICE_PROTOCOL=${WATCHER_SERVICE_PROTOCOL:-$SERVICE_PROTOCOL}
-if [[ "$WATCHER_USE_WSGI_MODE" == "uwsgi" ]]; then
 WATCHER_API_URL="$WATCHER_SERVICE_PROTOCOL://$WATCHER_SERVICE_HOST/infra-optim"
-else
-WATCHER_API_URL="$WATCHER_SERVICE_PROTOCOL://$WATCHER_SERVICE_HOST:$WATCHER_SERVICE_PORT"
-fi
 # Entry Points
 # ------------
@@ -101,11 +93,7 @@ function _cleanup_watcher_apache_wsgi {
 # runs that a clean run would need to clean up
 function cleanup_watcher {
 sudo rm -rf $WATCHER_STATE_PATH
-if [[ "$WATCHER_USE_WSGI_MODE" == "uwsgi" ]]; then
 remove_uwsgi_config "$WATCHER_UWSGI_CONF" "$WATCHER_UWSGI"
-else
-_cleanup_watcher_apache_wsgi
-fi
 }
 # configure_watcher() - Set config files, create data dirs, etc
@@ -154,31 +142,6 @@ function create_watcher_accounts {
 "$WATCHER_API_URL"
 }
-# _config_watcher_apache_wsgi() - Set WSGI config files of watcher
-function _config_watcher_apache_wsgi {
-local watcher_apache_conf
-if [[ "$WATCHER_USE_WSGI_MODE" == "mod_wsgi" ]]; then
-local service_port=$WATCHER_SERVICE_PORT
-if is_service_enabled tls-proxy; then
-service_port=$WATCHER_SERVICE_PORT_INT
-service_protocol="http"
-fi
-sudo mkdir -p $WATCHER_WSGI_DIR
-sudo cp $WATCHER_DIR/watcher/api/app.wsgi $WATCHER_WSGI_DIR/app.wsgi
-watcher_apache_conf=$(apache_site_config_for watcher-api)
-sudo cp $WATCHER_DEVSTACK_FILES_DIR/apache-watcher-api.template $watcher_apache_conf
-sudo sed -e "
-s|%WATCHER_SERVICE_PORT%|$service_port|g;
-s|%WATCHER_WSGI_DIR%|$WATCHER_WSGI_DIR|g;
-s|%USER%|$STACK_USER|g;
-s|%APIWORKERS%|$API_WORKERS|g;
-s|%APACHE_NAME%|$APACHE_NAME|g;
-" -i $watcher_apache_conf
-enable_apache_site watcher-api
-fi
-}
 # create_watcher_conf() - Create a new watcher.conf file
 function create_watcher_conf {
 # (Re)create ``watcher.conf``
@@ -196,11 +159,6 @@ function create_watcher_conf {
 iniset $WATCHER_CONF api host "$(ipv6_unquote $WATCHER_SERVICE_HOST)"
 iniset $WATCHER_CONF api port "$WATCHER_SERVICE_PORT_INT"
 # iniset $WATCHER_CONF api enable_ssl_api "True"
-else
-if [[ "$WATCHER_USE_WSGI_MODE" == "mod_wsgi" ]]; then
-iniset $WATCHER_CONF api host "$(ipv6_unquote $WATCHER_SERVICE_HOST)"
-iniset $WATCHER_CONF api port "$WATCHER_SERVICE_PORT"
-fi
 fi
 iniset $WATCHER_CONF oslo_policy policy_file $WATCHER_POLICY_YAML
@@ -228,12 +186,8 @@ function create_watcher_conf {
 # Format logging
 setup_logging $WATCHER_CONF
-#config apache files
-if [[ "$WATCHER_USE_WSGI_MODE" == "uwsgi" ]]; then
-write_uwsgi_config "$WATCHER_UWSGI_CONF" "$WATCHER_UWSGI" "/infra-optim"
-else
-_config_watcher_apache_wsgi
-fi
+write_uwsgi_config "$WATCHER_UWSGI_CONF" "$WATCHER_UWSGI" "/infra-optim" "" "watcher-api"
 # Register SSL certificates if provided
 if is_ssl_enabled_service watcher; then
 ensure_certificates WATCHER
@@ -273,9 +227,6 @@ function install_watcherclient {
 function install_watcher {
 git_clone $WATCHER_REPO $WATCHER_DIR $WATCHER_BRANCH
 setup_develop $WATCHER_DIR
-if [[ "$WATCHER_USE_WSGI_MODE" == "mod_wsgi" ]]; then
-install_apache_wsgi
-fi
 }
 # start_watcher_api() - Start the API process ahead of other things
@@ -289,19 +240,10 @@ function start_watcher_api {
 service_port=$WATCHER_SERVICE_PORT_INT
 service_protocol="http"
 fi
-if [[ "$WATCHER_USE_WSGI_MODE" == "uwsgi" ]]; then
 run_process "watcher-api" "$(which uwsgi) --procname-prefix watcher-api --ini $WATCHER_UWSGI_CONF"
 watcher_url=$service_protocol://$SERVICE_HOST/infra-optim
-else
-watcher_url=$service_protocol://$SERVICE_HOST:$service_port
-enable_apache_site watcher-api
-restart_apache_server
-# Start proxies if enabled
-if is_service_enabled tls-proxy; then
-start_tls_proxy watcher '*' $WATCHER_SERVICE_PORT $WATCHER_SERVICE_HOST $WATCHER_SERVICE_PORT_INT
-fi
-fi
+# TODO(sean-k-mooney): we should probably check that we can hit
+# the microversion endpoint and get a valid response.
 echo "Waiting for watcher-api to start..."
 if ! wait_for_service $SERVICE_TIMEOUT $watcher_url; then
 die $LINENO "watcher-api did not start"
@@ -319,17 +261,25 @@ function start_watcher {
 # stop_watcher() - Stop running processes (non-screen)
 function stop_watcher {
-if [[ "$WATCHER_USE_WSGI_MODE" == "uwsgi" ]]; then
 stop_process watcher-api
-else
-disable_apache_site watcher-api
-restart_apache_server
-fi
 for serv in watcher-decision-engine watcher-applier; do
 stop_process $serv
 done
 }
+# configure_tempest_for_watcher() - Configure Tempest for watcher
+function configure_tempest_for_watcher {
+# Set default microversion for watcher-tempest-plugin
+# Please make sure to update this when the microversion is updated, otherwise
+# new tests may be skipped.
+TEMPEST_WATCHER_MIN_MICROVERSION=${TEMPEST_WATCHER_MIN_MICROVERSION:-"1.0"}
+TEMPEST_WATCHER_MAX_MICROVERSION=${TEMPEST_WATCHER_MAX_MICROVERSION:-"1.4"}
+# Set microversion options in tempest.conf
+iniset $TEMPEST_CONFIG optimize min_microversion $TEMPEST_WATCHER_MIN_MICROVERSION
+iniset $TEMPEST_CONFIG optimize max_microversion $TEMPEST_WATCHER_MAX_MICROVERSION
+}
 # Restore xtrace
 $_XTRACE_WATCHER


@@ -36,6 +36,9 @@ if is_service_enabled watcher-api watcher-decision-engine watcher-applier; then
 # Start the watcher components
 echo_summary "Starting watcher"
 start_watcher
+elif [[ "$1" == "stack" && "$2" == "test-config" ]]; then
+echo_summary "Configuring tempest for watcher"
+configure_tempest_for_watcher
 fi
 if [[ "$1" == "unstack" ]]; then


@@ -29,19 +29,19 @@ This default can be overridden when a deployer uses a different label to
 identify the exporter host (for example ``hostname`` or ``host``, or any other
 label, as long as it identifies the host).
-Internally this label is used in creating a ``fqdn_instance_map``, mapping
-the fqdn with the Prometheus instance label associated with each exporter.
-The keys of the resulting fqdn_instance_map are expected to match the
+Internally this label is used in creating ``fqdn_instance_labels``, containing
+the list of values assigned to the label in the Prometheus targets.
+The elements of the resulting fqdn_instance_labels are expected to match the
 ``ComputeNode.hostname`` used in the Watcher decision engine cluster model.
-An example ``fqdn_instance_map`` is the following:
+An example ``fqdn_instance_labels`` is the following:
 .. code-block::
-{
-'ena.controlplane.domain': '10.1.2.1:9100',
-'dio.controlplane.domain': '10.1.2.2:9100',
-'tria.controlplane.domain': '10.1.2.3:9100'
-}
+[
+'ena.controlplane.domain',
+'dio.controlplane.domain',
+'tria.controlplane.domain',
+]
 For instance metrics, it is required that Prometheus contains a label
 with the uuid of the OpenStack instance in each relevant metric. By default,

pyproject.toml (new file)

@@ -0,0 +1,3 @@
[build-system]
requires = ["pbr>=6.0.0", "setuptools>=64.0.0"]
build-backend = "pbr.build"


@@ -0,0 +1,30 @@
---
features:
- |
A new module, ``watcher.wsgi``, has been added as a place to gather WSGI
``application`` objects. This is intended to ease deployment by providing
a consistent location for these objects. For example, if using uWSGI then
instead of:

.. code-block:: ini

[uwsgi]
wsgi-file = /bin/watcher-api-wsgi

You can now use:

.. code-block:: ini

[uwsgi]
module = watcher.wsgi.api:application

This also simplifies deployment with other WSGI servers that expect module
paths such as gunicorn.
deprecations:
- |
The watcher-api-wsgi console script is deprecated for removal
in a future release. This artifact is generated using a setuptools
extension provided by PBR, which is also deprecated.
Due to changes in Python packaging, this custom extension
is planned to be removed from all OpenStack projects in a future
PBR release in favor of module-based WSGI application entry points.


@@ -0,0 +1,8 @@
---
fixes:
- |
When using the prometheus datasource and more than one target has the same
value for the ``fqdn_label``, the driver used the wrong instance label to
query for host metrics. The ``instance`` label is no longer used in the
queries; instead the ``fqdn_label``, which identifies all the metrics for a
specific compute node, is used.
See Bug 2103451: https://bugs.launchpad.net/watcher/+bug/2103451 for more info.


@@ -0,0 +1,10 @@
---
fixes:
- |
Previously, when users attempted to create a new audit without providing
a name and a goal or an audit template, the API returned error 500 and an
incorrect error message was displayed.
Now, Watcher displays a helpful message and returns HTTP error 400.
For more info see: https://bugs.launchpad.net/watcher/+bug/2110947


@@ -0,0 +1,47 @@
---
security:
- |
Watcher no longer forges requests on behalf of a tenant when
swapping volumes. Prior to this release watcher had two implementations
for moving a volume: it could use cinder's volume migrate API or its own
internal implementation that directly called nova's volume attachment
update API. The former is safe and the recommended way to move volumes
between cinder storage backends; the internal implementation was insecure,
fragile due to a lack of error handling, and capable of deleting user data.
Insecure: the internal volume migration operation created a new keystone
user with a weak name and password and added it to the tenant's project
with the admin role. It then used that user to forge requests on behalf
of the tenant with admin rights to swap the volume. If the applier was
restarted during the execution of this operation, the user would never
be cleaned up.
Fragile: the error handling was minimal. The swap volume API is async,
so watcher has to poll for completion, and there was no support for
resuming if the poll was interrupted or the timeout was exceeded.
Data loss: while the internal polling logic returned success or failure,
watcher did not check the result; once the function returned, it
unconditionally deleted the source volume. For larger volumes this
could result in irretrievable data loss.
Finally, if a volume was swapped using the internal workflow, it put
the nova instance in an out-of-sync state. If the VM was live migrated
after the swap volume completed successfully, prior to a hard reboot,
then the migration would fail, or succeed and break tenant isolation.
See: https://bugs.launchpad.net/nova/+bug/2112187 for details.
fixes:
- |
All code related to creating keystone users and granting roles has been
removed. The internal swap volume implementation has been removed and
replaced by cinder's volume migrate API. Note that as part of this change
Watcher will no longer attempt volume migrations or retypes if the
instance is in the `Verify Resize` task state. This resolves several
issues related to volume migration in the zone migration and
storage capacity balance strategies. While efforts have been made
to maintain backward compatibility, these changes are required to
address a security weakness in watcher's prior approach.
See: https://bugs.launchpad.net/nova/+bug/2112187 for more context.


@@ -0,0 +1,14 @@
---
fixes:
- |
When running an audit with the `workload_stabilization` strategy and the
`instance_ram_usage` metric in a deployment with a prometheus datasource,
the host RAM usage metric was reported in the wrong unit. This led to an
incorrect standard deviation, and therefore to incorrect action plans,
because the wrong scale factor was applied in the algorithm.
The host RAM usage metric is now properly reported in KB when using a
prometheus datasource, and the `workload_stabilization` strategy
calculates the standard deviation correctly.
For more details: https://launchpad.net/bugs/2113776
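The unit fix amounts to converting the node exporter's byte counts to KB exactly once. A minimal sketch (the helper name is hypothetical; in Watcher the conversion actually happens inside the PromQL query string):

```python
def host_ram_usage_kb(mem_total_bytes, mem_available_bytes):
    """Used host memory in KB, as the strategy expects.

    Prometheus node exporter reports memory in bytes, so divide by
    1024 once. Dividing twice yields MB, which is the wrong scale
    factor that skewed the standard deviation calculation.
    """
    return (mem_total_bytes - mem_available_bytes) / 1024
```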


@@ -0,0 +1,10 @@
---
fixes:
- |
The host maintenance strategy should migrate servers to the backup node
if one is specified, or otherwise rely on the nova scheduler. Instead, it
was re-enabling hosts that had been disabled with the `watcher_disabled`
reason and migrating servers to those nodes. This can impact customer
workloads: compute nodes were disabled for a reason.
The host maintenance strategy is fixed to migrate servers only to the
backup node, or to rely on the nova scheduler if no backup node is
provided.


@@ -0,0 +1,13 @@
---
fixes:
- |
Previously, if an action failed in an action plan, the state of the
action plan was reported as SUCCEEDED once execution finished, regardless
of the outcome of the individual actions.
Watcher now reflects the actual state of all the actions in the plan
after the execution has finished. If any action has status FAILED, the
state of the action plan is set to FAILED. This is the expected
behavior according to the Watcher documentation.
For more info see: https://bugs.launchpad.net/watcher/+bug/2106407
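The resulting state selection reduces to a simple rule over the actions' final states. An illustrative sketch (hypothetical function; the real code queries the Action objects from the database):

```python
def final_action_plan_state(action_states):
    """Pick the action plan's final state from its actions' states.

    A single FAILED action is enough to mark the whole plan FAILED;
    only if every action succeeded is the plan reported as SUCCEEDED.
    """
    if any(state == "FAILED" for state in action_states):
        return "FAILED"
    return "SUCCEEDED"
```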


@@ -0,0 +1,7 @@
---
fixes:
- |
`Bug #2110538 <https://bugs.launchpad.net/watcher/+bug/2110538>`_:
Corrected the HTTP error code returned when watcher users try to create
audits with invalid parameters. The API now correctly returns a 400 Bad
Request error.
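A minimal sketch of the kind of validation now done at audit-creation time (hypothetical function and values; in Watcher the checks live in the API controller, where the raised exception is translated into an HTTP 400 response):

```python
def validate_audit_request(audit_type, goal=None, audit_template_uuid=None):
    """Reject malformed audit-creation requests with a client error.

    Raising ValueError here stands in for the API layer returning
    400 Bad Request instead of an internal server error.
    """
    if audit_type not in ("ONESHOT", "CONTINUOUS"):
        raise ValueError("Audit type %s not found" % audit_type)
    if not goal and not audit_template_uuid:
        raise ValueError(
            "A valid goal or audit_template_id must be provided")
```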


@@ -6,6 +6,7 @@ description_file =
 author = OpenStack
 author_email = openstack-discuss@lists.openstack.org
 home_page = https://docs.openstack.org/watcher/latest/
+# TODO(sean-k-mooney): bump to >= 3.10 before m3.
 python_requires = >=3.9
 classifier =
     Environment :: OpenStack
@@ -17,7 +18,6 @@ classifier =
     Programming Language :: Python :: Implementation :: CPython
     Programming Language :: Python :: 3 :: Only
     Programming Language :: Python :: 3
-    Programming Language :: Python :: 3.9
     Programming Language :: Python :: 3.10
     Programming Language :: Python :: 3.11
     Programming Language :: Python :: 3.12

tox.ini

@@ -8,7 +8,7 @@ basepython = python3
 usedevelop = True
 allowlist_externals = find
     rm
-install_command = pip install -c{env:TOX_CONSTRAINTS_FILE:https://releases.openstack.org/constraints/upper/master} {opts} {packages}
+install_command = pip install -c{env:TOX_CONSTRAINTS_FILE:https://releases.openstack.org/constraints/upper/2025.1} {opts} {packages}
 setenv =
     VIRTUAL_ENV={envdir}
     OS_STDOUT_CAPTURE=1
@@ -106,8 +106,10 @@ commands =
     make -C doc/build/pdf

 [testenv:releasenotes]
-deps = -r{toxinidir}/doc/requirements.txt
-commands = sphinx-build -a -W -E -d releasenotes/build/doctrees --keep-going -b html releasenotes/source releasenotes/build/html
+deps = {[testenv:docs]deps}
+commands =
+    rm -rf releasenotes/build
+    sphinx-build -W --keep-going -b html -j auto releasenotes/source releasenotes/build/html

 [testenv:bandit]
 skip_install = true
@@ -146,8 +148,3 @@ extension =
 N342 = checks:no_redundant_import_alias
 N366 = checks:import_stock_mock
 paths = ./watcher/hacking
-
-[doc8]
-extension=.rst
-# todo: stop ignoring doc/source/man when https://bugs.launchpad.net/doc8/+bug/1502391 is fixed
-ignore-path=doc/source/image_src,doc/source/man,doc/source/api


@@ -33,6 +33,7 @@ import datetime

 from dateutil import tz
 from http import HTTPStatus
+import jsonschema
 from oslo_log import log
 from oslo_utils import timeutils
 import pecan
@@ -114,6 +115,11 @@ class AuditPostType(wtypes.Base):
         if self.audit_type not in audit_type_values:
             raise exception.AuditTypeNotFound(audit_type=self.audit_type)

+        if not self.audit_template_uuid and not self.goal:
+            message = _(
+                'A valid goal or audit_template_id must be provided')
+            raise exception.Invalid(message)
+
         if (self.audit_type == objects.audit.AuditType.ONESHOT.value and
                 self.interval not in (wtypes.Unset, None)):
             raise exception.AuditIntervalNotAllowed(audit_type=self.audit_type)
@@ -612,11 +618,6 @@ class AuditsController(rest.RestController):
         if self.from_audits:
             raise exception.OperationNotPermitted

-        if not audit._goal_uuid:
-            raise exception.Invalid(
-                message=_('A valid goal_id or audit_template_id '
-                          'must be provided'))
-
         strategy_uuid = audit.strategy_uuid
         no_schema = True
         if strategy_uuid is not None:
@@ -627,8 +628,12 @@ class AuditsController(rest.RestController):
             if schema:
                 # validate input parameter with default value feedback
                 no_schema = False
-                utils.StrictDefaultValidatingDraft4Validator(schema).validate(
-                    audit.parameters)
+                try:
+                    utils.StrictDefaultValidatingDraft4Validator(
+                        schema).validate(audit.parameters)
+                except jsonschema.exceptions.ValidationError as e:
+                    raise exception.Invalid(
+                        _('Invalid parameters for strategy: %s') % e)

         if no_schema and audit.parameters:
             raise exception.Invalid(_('Specify parameters but no predefined '


@@ -56,12 +56,30 @@ class DefaultActionPlanHandler(base.BaseActionPlanHandler):
             applier = default.DefaultApplier(self.ctx, self.service)
             applier.execute(self.action_plan_uuid)
-            action_plan.state = objects.action_plan.State.SUCCEEDED
+
+            # If any action has failed the action plan should be FAILED
+            # Define default values for successful execution
+            ap_state = objects.action_plan.State.SUCCEEDED
+            notification_kwargs = {
+                'phase': fields.NotificationPhase.END
+            }
+
+            failed_filter = {'action_plan_uuid': self.action_plan_uuid,
+                             'state': objects.action.State.FAILED}
+            failed_actions = objects.Action.list(
+                self.ctx, filters=failed_filter, eager=True)
+            if failed_actions:
+                ap_state = objects.action_plan.State.FAILED
+                notification_kwargs = {
+                    'phase': fields.NotificationPhase.ERROR,
+                    'priority': fields.NotificationPriority.ERROR
+                }
+
+            action_plan.state = ap_state
             action_plan.save()
             notifications.action_plan.send_action_notification(
                 self.ctx, action_plan,
                 action=fields.NotificationAction.EXECUTION,
-                phase=fields.NotificationPhase.END)
+                **notification_kwargs)
         except exception.ActionPlanCancelled as e:
             LOG.exception(e)


@@ -17,14 +17,11 @@
 import jsonschema
 from oslo_log import log

-from cinderclient import client as cinder_client
 from watcher._i18n import _
 from watcher.applier.actions import base
 from watcher.common import cinder_helper
 from watcher.common import exception
-from watcher.common import keystone_helper
 from watcher.common import nova_helper
-from watcher.common import utils
 from watcher import conf

 CONF = conf.CONF
@@ -70,8 +67,6 @@ class VolumeMigrate(base.BaseAction):
     def __init__(self, config, osc=None):
         super(VolumeMigrate, self).__init__(config)
-        self.temp_username = utils.random_string(10)
-        self.temp_password = utils.random_string(10)
         self.cinder_util = cinder_helper.CinderHelper(osc=self.osc)
         self.nova_util = nova_helper.NovaHelper(osc=self.osc)
@@ -134,83 +129,42 @@ class VolumeMigrate(base.BaseAction):
     def _can_swap(self, volume):
         """Judge volume can be swapped"""
+        # TODO(sean-k-mooney): rename this to _can_migrate and update
+        # tests to reflect that.
+        # cinder volume migration can migrate volumes that are not
+        # attached to instances or nova can migrate the data for cinder
+        # if the volume is in-use. If the volume has no attachments
+        # allow cinder to decided if it can be migrated.
         if not volume.attachments:
-            return False
+            LOG.debug(f"volume: {volume.id} has no attachments")
+            return True
+        # since it has attachments we need to validate nova's constraints
         instance_id = volume.attachments[0]['server_id']
         instance_status = self.nova_util.find_instance(instance_id).status
-        if (volume.status == 'in-use' and
-                instance_status in ('ACTIVE', 'PAUSED', 'RESIZED')):
-            return True
-        return False
-
-    def _create_user(self, volume, user):
-        """Create user with volume attribute and user information"""
-        keystone_util = keystone_helper.KeystoneHelper(osc=self.osc)
-        project_id = getattr(volume, 'os-vol-tenant-attr:tenant_id')
-        user['project'] = project_id
-        user['domain'] = keystone_util.get_project(project_id).domain_id
-        user['roles'] = ['admin']
-        return keystone_util.create_user(user)
-
-    def _get_cinder_client(self, session):
-        """Get cinder client by session"""
-        return cinder_client.Client(
-            CONF.cinder_client.api_version,
-            session=session,
-            endpoint_type=CONF.cinder_client.endpoint_type)
-
-    def _swap_volume(self, volume, dest_type):
-        """Swap volume to dest_type
-
-        Limitation note: only for compute libvirt driver
-        """
-        if not dest_type:
-            raise exception.Invalid(
-                message=(_("destination type is required when "
-                           "migration type is swap")))
-
-        if not self._can_swap(volume):
-            raise exception.Invalid(
-                message=(_("Invalid state for swapping volume")))
-
-        user_info = {
-            'name': self.temp_username,
-            'password': self.temp_password}
-        user = self._create_user(volume, user_info)
-        keystone_util = keystone_helper.KeystoneHelper(osc=self.osc)
-        try:
-            session = keystone_util.create_session(
-                user.id, self.temp_password)
-            temp_cinder = self._get_cinder_client(session)
-
-            # swap volume
-            new_volume = self.cinder_util.create_volume(
-                temp_cinder, volume, dest_type)
-            self.nova_util.swap_volume(volume, new_volume)
-
-            # delete old volume
-            self.cinder_util.delete_volume(volume)
-        finally:
-            keystone_util.delete_user(user)
-
-        return True
+        LOG.debug(
+            f"volume: {volume.id} is attached to instance: {instance_id} "
+            f"in instance status: {instance_status}")
+        # NOTE(sean-k-mooney): This used to allow RESIZED which
+        # is the resize_verify task state, that is not an acceptable time
+        # to migrate volumes, if nova does not block this in the API
+        # today that is probably a bug. PAUSED is also questionable but
+        # it should generally be safe.
+        return (volume.status == 'in-use' and
                instance_status in ('ACTIVE', 'PAUSED'))

     def _migrate(self, volume_id, dest_node, dest_type):
         try:
             volume = self.cinder_util.get_volume(volume_id)
-            if self.migration_type == self.SWAP:
-                if dest_node:
-                    LOG.warning("dest_node is ignored")
-                return self._swap_volume(volume, dest_type)
+            # for backward compatibility map swap to migrate.
+            if self.migration_type in (self.SWAP, self.MIGRATE):
+                if not self._can_swap(volume):
+                    raise exception.Invalid(
+                        message=(_("Invalid state for swapping volume")))
+                return self.cinder_util.migrate(volume, dest_node)
             elif self.migration_type == self.RETYPE:
                 return self.cinder_util.retype(volume, dest_type)
-            elif self.migration_type == self.MIGRATE:
-                return self.cinder_util.migrate(volume, dest_node)
             else:
                 raise exception.Invalid(
                     message=(_("Migration of type '%(migration_type)s' is not "


@@ -15,8 +15,6 @@
 from oslo_log import log

 from keystoneauth1.exceptions import http as ks_exceptions
-from keystoneauth1 import loading
-from keystoneauth1 import session
 from watcher._i18n import _
 from watcher.common import clients
 from watcher.common import exception
@@ -90,35 +88,3 @@ class KeystoneHelper(object):
                 message=(_("Domain name seems ambiguous: %s") %
                          name_or_id))
         return domains[0]
-
-    def create_session(self, user_id, password):
-        user = self.get_user(user_id)
-        loader = loading.get_plugin_loader('password')
-        auth = loader.load_from_options(
-            auth_url=CONF.watcher_clients_auth.auth_url,
-            password=password,
-            user_id=user_id,
-            project_id=user.default_project_id)
-        return session.Session(auth=auth)
-
-    def create_user(self, user):
-        project = self.get_project(user['project'])
-        domain = self.get_domain(user['domain'])
-        _user = self.keystone.users.create(
-            user['name'],
-            password=user['password'],
-            domain=domain,
-            project=project,
-        )
-        for role in user['roles']:
-            role = self.get_role(role)
-            self.keystone.roles.grant(
-                role.id, user=_user.id, project=project.id)
-        return _user
-
-    def delete_user(self, user):
-        try:
-            user = self.get_user(user)
-            self.keystone.users.delete(user)
-        except exception.Invalid:
-            pass


@@ -19,9 +19,7 @@
 import asyncio
 import datetime
 import inspect
-import random
 import re
-import string

 from croniter import croniter
 import eventlet
@@ -160,14 +158,10 @@ def extend_with_strict_schema(validator_class):
 StrictDefaultValidatingDraft4Validator = extend_with_default(
     extend_with_strict_schema(validators.Draft4Validator))
 Draft4Validator = validators.Draft4Validator

-
-def random_string(n):
-    return ''.join([random.choice(
-        string.ascii_letters + string.digits) for i in range(n)])
-
-
 # Some clients (e.g. MAAS) use asyncio, which isn't compatible with Eventlet.
 # As a workaround, we're delegating such calls to a native thread.
 def async_compat_call(f, *args, **kwargs):


@@ -178,7 +178,7 @@ class DataSourceBase(object):
                        granularity=None):
         """Get the ram usage for a host such as a compute_node

-        :return: ram usage as float in megabytes
+        :return: ram usage as float in kibibytes
         """
         pass


@@ -51,10 +51,10 @@ class PrometheusHelper(base.DataSourceBase):
     The prometheus helper uses the PrometheusAPIClient provided by
     python-observabilityclient.

-    The prometheus_fqdn_instance_map maps the fqdn of each node to the
-    Prometheus instance label added to all metrics on that node. When
-    making queries to Prometheus we use the instance label to specify
-    the node for which metrics are to be retrieved.
+    The prometheus_fqdn_labels contains a list the values contained in
+    the fqdn_label in the Prometheus instance. When making queries to
+    Prometheus we use the fqdn_label to specify the node for which
+    metrics are to be retrieved.

     host, port and fqdn_label come from watcher_client
     config. The prometheus_fqdn_label allows override of the required label
     in Prometheus scrape configs that specifies each target's fqdn.
@@ -63,8 +63,8 @@ class PrometheusHelper(base.DataSourceBase):
         self.prometheus_fqdn_label = (
             CONF.prometheus_client.fqdn_label
         )
-        self.prometheus_fqdn_instance_map = (
-            self._build_prometheus_fqdn_instance_map()
+        self.prometheus_fqdn_labels = (
+            self._build_prometheus_fqdn_labels()
         )
         self.prometheus_host_instance_map = (
             self._build_prometheus_host_instance_map()
@@ -136,73 +136,71 @@ class PrometheusHelper(base.DataSourceBase):
         return the_client

-    def _build_prometheus_fqdn_instance_map(self):
-        """Build the fqdn<-->instance_label mapping needed for queries
+    def _build_prometheus_fqdn_labels(self):
+        """Build the list of fqdn_label values to be used in host queries

         Watcher knows nodes by their hostname. In Prometheus however the
         scrape targets (also known as 'instances') are specified by I.P.
-        (or hostname) and port number. This function creates a mapping between
-        the fully qualified domain name of each node and the corresponding
-        instance label used in the scrape config. This relies on a custom
-        'fqdn' label added to Prometheus scrape_configs. Operators can use
-        a different custom label instead by setting the prometheus_fqdn_label
+        (or hostname) and port number and fqdn is stored in a custom 'fqdn'
+        label added to Prometheus scrape_configs. Operators can use a
+        different custom label instead by setting the prometheus_fqdn_label
         config option under the prometheus_client section of watcher config.
-        The built prometheus_fqdn_instance_map is used to match watcher
-        node.hostname if watcher stores fqdn and otherwise the
-        host_instance_map is used instead.
-        :return a dict mapping fqdn to instance label. For example:
-        {'marios-env-again.controlplane.domain': '10.1.2.3:9100'}
+        The built prometheus_fqdn_labels is created with the full list
+        of values of the prometheus_fqdn_label label in Prometheus. This will
+        be used to create a map of hostname<-->fqdn and to identify if a target
+        exist in prometheus for the compute nodes before sending the query.
+        :return a set of values of the fqdn label. For example:
+        {'foo.example.com', 'bar.example.com'}
+        {'foo', 'bar'}
         """
         prometheus_targets = self.prometheus._get(
             "targets?state=active")['data']['activeTargets']
         # >>> prometheus_targets[0]['labels']
         # {'fqdn': 'marios-env-again.controlplane.domain',
         #  'instance': 'localhost:9100', 'job': 'node'}
-        fqdn_instance_map = {
-            fqdn: instance for (fqdn, instance) in (
-                (target['labels'].get(self.prometheus_fqdn_label),
-                 target['labels'].get('instance'))
-                for target in prometheus_targets
-                if target.get('labels', {}).get(self.prometheus_fqdn_label)
-            )
-        }
-        if not fqdn_instance_map:
+        fqdn_instance_labels = set()
+        for target in prometheus_targets:
+            if target.get('labels', {}).get(self.prometheus_fqdn_label):
+                fqdn_instance_labels.add(
+                    target['labels'].get(self.prometheus_fqdn_label))
+
+        if not fqdn_instance_labels:
             LOG.error(
-                "Could not create fqdn instance map from Prometheus "
+                "Could not create fqdn labels list from Prometheus "
                 "targets config. Prometheus returned the following: %s",
                 prometheus_targets
             )
-            return {}
-        return fqdn_instance_map
+            return set()
+        return fqdn_instance_labels

     def _build_prometheus_host_instance_map(self):
         """Build the hostname<-->instance_label mapping needed for queries

-        The prometheus_fqdn_instance_map has the fully qualified domain name
+        The prometheus_fqdn_labels has the fully qualified domain name
         for hosts. This will create a duplicate map containing only the host
         name part. Depending on the watcher node.hostname either the
-        fqdn_instance_map or the host_instance_map will be used to resolve
-        the correct prometheus instance label for queries. In the event the
-        fqdn_instance_map keys are not valid fqdn (for example it contains
+        fqdn_instance_labels or the host_instance_map will be used to resolve
+        the correct prometheus fqdn_label for queries. In the event the
+        fqdn_instance_labels elements are not valid fqdn (for example it has
         hostnames, not fqdn) the host_instance_map cannot be created and
         an empty dictionary is returned with a warning logged.
         :return a dict mapping hostname to instance label. For example:
-        {'marios-env-again': 'localhost:9100'}
+        {'foo': 'foo.example.com', 'bar': 'bar.example.com'}
         """
-        if not self.prometheus_fqdn_instance_map:
+        if not self.prometheus_fqdn_labels:
             LOG.error("Cannot build host_instance_map without "
-                      "fqdn_instance_map")
+                      "fqdn_instance_labels")
             return {}
         host_instance_map = {
-            host: instance for (host, instance) in (
-                (fqdn.split('.')[0], inst)
-                for fqdn, inst in self.prometheus_fqdn_instance_map.items()
+            host: fqdn for (host, fqdn) in (
+                (fqdn.split('.')[0], fqdn)
+                for fqdn in self.prometheus_fqdn_labels
                 if '.' in fqdn
             )
         }
         if not host_instance_map:
             LOG.warning("Creating empty host instance map. Are the keys "
-                        "in prometheus_fqdn_instance_map valid fqdn?")
+                        "in prometheus_fqdn_labels valid fqdn?")
             return {}
         return host_instance_map
@@ -210,23 +208,25 @@ class PrometheusHelper(base.DataSourceBase):
         """Resolve the prometheus instance label to use in queries

         Given the watcher node.hostname, resolve the prometheus instance
-        label for use in queries, first trying the fqdn_instance_map and
+        label for use in queries, first trying the fqdn_instance_labels and
         then the host_instance_map (watcher.node_name can be fqdn or hostname).
         If the name is not resolved after the first attempt, rebuild the fqdn
         and host instance maps and try again. This allows for new hosts added
-        after the initialisation of the fqdn_instance_map.
+        after the initialisation of the fqdn_instance_labels.
         :param node_name: the watcher node.hostname
         :return String for the prometheus instance label and None if not found
         """
         def _query_maps(node):
-            return self.prometheus_fqdn_instance_map.get(
-                node, self.prometheus_host_instance_map.get(node, None))
+            if node in self.prometheus_fqdn_labels:
+                return node
+            else:
+                return self.prometheus_host_instance_map.get(node, None)

         instance_label = _query_maps(node_name)
         # refresh the fqdn and host instance maps and retry
         if not instance_label:
-            self.prometheus_fqdn_instance_map = (
-                self._build_prometheus_fqdn_instance_map()
+            self.prometheus_fqdn_labels = (
+                self._build_prometheus_fqdn_labels()
             )
             self.prometheus_host_instance_map = (
                 self._build_prometheus_host_instance_map()
@@ -264,8 +264,8 @@ class PrometheusHelper(base.DataSourceBase):
         This function builds and returns the string query that will be sent
         to the Prometheus server /query endpoint. For host cpu usage we use:

-        100 - (avg by (instance)(rate(node_cpu_seconds_total{mode='idle',
-        instance='some_host'}[300s])) * 100)
+        100 - (avg by (fqdn)(rate(node_cpu_seconds_total{mode='idle',
+        fqdn='some_host'}[300s])) * 100)

         so using prometheus rate function over the specified period, we average
         per instance (all cpus) idle time and then 'everything else' is cpu
@@ -276,7 +276,7 @@ class PrometheusHelper(base.DataSourceBase):
         (node_memory_MemTotal_bytes{instance='the_host'} -
         avg_over_time(
         node_memory_MemAvailable_bytes{instance='the_host'}[300s]))
-        / 1024 / 1024
+        / 1024

         So we take total and subtract available memory to determine
         how much is in use. We use the prometheus xxx_over_time functions
@@ -307,17 +307,22 @@ class PrometheusHelper(base.DataSourceBase):
         if meter == 'node_cpu_seconds_total':
             query_args = (
-                "100 - (%s by (instance)(rate(%s"
-                "{mode='idle',instance='%s'}[%ss])) * 100)" %
-                (aggregate, meter, instance_label, period)
+                "100 - (%(agg)s by (%(label)s)(rate(%(meter)s"
+                "{mode='idle',%(label)s='%(label_value)s'}[%(period)ss])) "
+                "* 100)"
+                % {'label': self.prometheus_fqdn_label,
+                   'label_value': instance_label, 'agg': aggregate,
+                   'meter': meter, 'period': period}
             )
         elif meter == 'node_memory_MemAvailable_bytes':
+            # Prometheus metric is in B and we need to return KB
             query_args = (
-                "(node_memory_MemTotal_bytes{instance='%s'} "
-                "- %s_over_time(%s{instance='%s'}[%ss])) "
-                "/ 1024 / 1024" %
-                (instance_label, aggregate, meter,
-                 instance_label, period)
+                "(node_memory_MemTotal_bytes{%(label)s='%(label_value)s'} "
+                "- %(agg)s_over_time(%(meter)s{%(label)s='%(label_value)s'}"
+                "[%(period)ss])) / 1024"
+                % {'label': self.prometheus_fqdn_label,
+                   'label_value': instance_label, 'agg': aggregate,
+                   'meter': meter, 'period': period}
            )
         elif meter == 'ceilometer_memory_usage':
             query_args = (
@@ -338,10 +343,12 @@ class PrometheusHelper(base.DataSourceBase):
             )
             vcpus = 1
             query_args = (
-                "clamp_max((%s by (instance)(rate(%s{%s='%s'}[%ss]))/10e+8) "
-                "*(100/%s), 100)" %
-                (aggregate, meter, uuid_label_key, instance_label, period,
-                 vcpus)
+                "clamp_max((%(agg)s by (%(label)s)"
+                "(rate(%(meter)s{%(label)s='%(label_value)s'}[%(period)ss]))"
+                "/10e+8) *(100/%(vcpus)s), 100)"
+                % {'label': uuid_label_key, 'label_value': instance_label,
+                   'agg': aggregate, 'meter': meter, 'period': period,
+                   'vcpus': vcpus}
             )
         else:
             raise exception.InvalidParameter(
@@ -460,8 +467,8 @@ class PrometheusHelper(base.DataSourceBase):
         This calculates the host cpu usage and returns it as a percentage
         The calculation is made by using the cpu 'idle' time, per
         instance (so all CPUs are included). For example the query looks like
-        (100 - (avg by (instance)(rate(node_cpu_seconds_total
-        {mode='idle',instance='localhost:9100'}[300s])) * 100))
+        (100 - (avg by (fqdn)(rate(node_cpu_seconds_total
+        {mode='idle',fqdn='compute1.example.com'}[300s])) * 100))
         """
         aggregate = self._invert_max_min_aggregate(aggregate)
         cpu_usage = self.statistic_aggregation(

@@ -53,7 +53,6 @@ class HostMaintenance(base.HostMaintenanceBaseStrategy):
INSTANCE_MIGRATION = "migrate" INSTANCE_MIGRATION = "migrate"
CHANGE_NOVA_SERVICE_STATE = "change_nova_service_state" CHANGE_NOVA_SERVICE_STATE = "change_nova_service_state"
REASON_FOR_DISABLE = 'watcher_disabled'
def __init__(self, config, osc=None): def __init__(self, config, osc=None):
super(HostMaintenance, self).__init__(config, osc) super(HostMaintenance, self).__init__(config, osc)
@@ -95,10 +94,6 @@ class HostMaintenance(base.HostMaintenanceBaseStrategy):
cn.status == element.ServiceState.DISABLED.value and cn.status == element.ServiceState.DISABLED.value and
cn.disabled_reason == reason} cn.disabled_reason == reason}
def get_disabled_compute_nodes(self):
return self.get_disabled_compute_nodes_with_reason(
self.REASON_FOR_DISABLE)
def get_instance_state_str(self, instance): def get_instance_state_str(self, instance):
"""Get instance state in string format""" """Get instance state in string format"""
if isinstance(instance.state, str): if isinstance(instance.state, str):
@@ -195,7 +190,7 @@ class HostMaintenance(base.HostMaintenanceBaseStrategy):
'source_node': src_node.uuid, 'source_node': src_node.uuid,
'resource_name': instance.name} 'resource_name': instance.name}
if des_node: if des_node:
params['destination_node'] = des_node.uuid params['destination_node'] = des_node.hostname
self.solution.add_action(action_type=self.INSTANCE_MIGRATION, self.solution.add_action(action_type=self.INSTANCE_MIGRATION,
resource_id=instance.uuid, resource_id=instance.uuid,
input_parameters=params) input_parameters=params)
@@ -215,8 +210,7 @@ class HostMaintenance(base.HostMaintenanceBaseStrategy):
"""safe maintain one compute node """safe maintain one compute node
Migrate all instances of the maintenance_node intensively to the Migrate all instances of the maintenance_node intensively to the
backup host. If the user didn't give the backup host, it will backup host.
select one unused node to backup the maintaining node.
It calculate the resource both of the backup node and maintaining It calculate the resource both of the backup node and maintaining
node to evaluate the migrations from maintaining node to backup node. node to evaluate the migrations from maintaining node to backup node.
@@ -233,22 +227,6 @@ class HostMaintenance(base.HostMaintenanceBaseStrategy):
            self.host_migration(maintenance_node, backup_node)
            return True

-       # If the user didn't give the backup host, select one unused
-       # node with required capacity, then migrates all instances
-       # from maintaining node to it.
-       nodes = sorted(
-           self.get_disabled_compute_nodes().values(),
-           key=lambda x: self.get_node_capacity(x)['cpu'])
-       if maintenance_node in nodes:
-           nodes.remove(maintenance_node)
-       for node in nodes:
-           if self.host_fits(maintenance_node, node):
-               self.enable_compute_node_if_disabled(node)
-               self.add_action_maintain_compute_node(maintenance_node)
-               self.host_migration(maintenance_node, node)
-               return True
        return False
    def try_maintain(self, maintenance_node):


@@ -57,8 +57,19 @@ class ZoneMigration(base.ZoneMigrationBaseStrategy):
        self.planned_cold_count = 0
        self.volume_count = 0
        self.planned_volume_count = 0
-       self.volume_update_count = 0
-       self.planned_volume_update_count = 0

+   # TODO(sean-n-mooney) This is backward compatibility
+   # for calling the swap code paths. Swap is now an alias
+   # for migrate, we should clean this up in a future
+   # cycle.
+   @property
+   def volume_update_count(self):
+       return self.volume_count

+   # same as above clean up later.
+   @property
+   def planned_volume_update_count(self):
+       return self.planned_volume_count
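The deprecated `*_update_count` attributes above are kept as read-only aliases of the migrate counters. The same backward-compatibility pattern, reduced to a standalone sketch (class and attribute names here are illustrative, not Watcher's):

```python
class MigrationCounts:
    """Counts volume migrations; old 'update' names kept as aliases."""

    def __init__(self):
        self.volume_count = 0
        self.planned_volume_count = 0

    # Deprecated alias: a swap/update is now just a migrate.
    @property
    def volume_update_count(self):
        return self.volume_count

    @property
    def planned_volume_update_count(self):
        return self.planned_volume_count
```

Because the aliases are properties, any caller still reading the old names automatically sees the migrate counters, so existing audits keep working without the removed swap bookkeeping.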
    @classmethod
    def get_name(cls):
@@ -312,8 +323,8 @@ class ZoneMigration(base.ZoneMigrationBaseStrategy):
            planned_cold_migrate_instance_count=self.planned_cold_count,
            volume_migrate_count=self.volume_count,
            planned_volume_migrate_count=self.planned_volume_count,
-           volume_update_count=self.volume_update_count,
-           planned_volume_update_count=self.planned_volume_update_count
+           volume_update_count=self.volume_count,
+           planned_volume_update_count=self.planned_volume_count
        )
    def set_migration_count(self, targets):
@@ -328,10 +339,7 @@ class ZoneMigration(base.ZoneMigrationBaseStrategy):
            elif self.is_cold(instance):
                self.cold_count += 1
        for volume in targets.get('volume', []):
-           if self.is_available(volume):
-               self.volume_count += 1
-           elif self.is_in_use(volume):
-               self.volume_update_count += 1
+           self.volume_count += 1
    def is_live(self, instance):
        status = getattr(instance, 'status')
@@ -404,13 +412,10 @@ class ZoneMigration(base.ZoneMigrationBaseStrategy):
        LOG.debug(src_type)
        LOG.debug("%s %s", dst_pool, dst_type)
-       if self.is_available(volume):
-           if src_type == dst_type:
-               self._volume_migrate(volume, dst_pool)
-           else:
-               self._volume_retype(volume, dst_type)
-       elif self.is_in_use(volume):
-           self._volume_update(volume, dst_type)
+       if src_type == dst_type:
+           self._volume_migrate(volume, dst_pool)
+       else:
+           self._volume_retype(volume, dst_type)
        # if with_attached_volume is True, migrate attaching instances
        if self.with_attached_volume:
@@ -464,16 +469,6 @@ class ZoneMigration(base.ZoneMigrationBaseStrategy):
            input_parameters=parameters)
        self.planned_cold_count += 1

-   def _volume_update(self, volume, dst_type):
-       parameters = {"migration_type": "swap",
-                     "destination_type": dst_type,
-                     "resource_name": volume.name}
-       self.solution.add_action(
-           action_type="volume_migrate",
-           resource_id=volume.id,
-           input_parameters=parameters)
-       self.planned_volume_update_count += 1

    def _volume_migrate(self, volume, dst_pool):
        parameters = {"migration_type": "migrate",
                      "destination_node": dst_pool,


@@ -827,17 +827,41 @@ class TestPost(api_base.FunctionalTest):
        self.assertIn(expected_error_msg, response.json['error_message'])
        assert not mock_trigger_audit.called

+   @mock.patch.object(deapi.DecisionEngineAPI, 'trigger_audit')
+   def test_create_audit_with_missing_parameter(
+           self, mock_trigger_audit):
+       mock_trigger_audit.return_value = mock.ANY
+       audit_template = self.prepare_audit_template_strategy_with_parameter()
+       audit_dict = api_utils.audit_post_data(
+           parameters={})
+       audit_dict['audit_template_uuid'] = audit_template['uuid']
+       del_keys = ['uuid', 'goal_id', 'strategy_id', 'state', 'interval',
+                   'scope', 'next_run_time', 'hostname']
+       for k in del_keys:
+           del audit_dict[k]
+       response = self.post_json('/audits', audit_dict, expect_errors=True)
+       self.assertEqual(HTTPStatus.BAD_REQUEST, response.status_int)
+       self.assertEqual("application/json", response.content_type)
+       expected_error_msg = (
+           "Invalid parameters for strategy: 'fake1' is a required property")
+       self.assertTrue(response.json['error_message'])
+       self.assertIn(expected_error_msg, response.json['error_message'])
+       assert not mock_trigger_audit.called
    def prepare_audit_template_strategy_with_parameter(self):
        fake_spec = {
            "properties": {
                "fake1": {
                    "description": "number parameter example",
                    "type": "number",
-                   "default": 3.2,
                    "minimum": 1.0,
                    "maximum": 10.2,
                }
-           }
+           },
+           'required': ['fake1']
        }
        template_uuid = 'e74c40e0-d825-11e2-a28f-0800200c9a67'
        strategy_uuid = 'e74c40e0-d825-11e2-a28f-0800200c9a68'
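With `fake1` now listed under `required` and no `default`, posting an empty `parameters` dict fails schema validation, which is where the asserted message comes from. A minimal sketch of that validation step, assuming the `jsonschema` library (Watcher's error text matches jsonschema's wording, but this helper is hypothetical, not the API's actual code path):

```python
import jsonschema

# The strategy parameter spec from the test above.
fake_spec = {
    "properties": {
        "fake1": {
            "description": "number parameter example",
            "type": "number",
            "minimum": 1.0,
            "maximum": 10.2,
        }
    },
    "required": ["fake1"],
}


def validate_parameters(params):
    """Return None if params satisfy the spec, else the error message."""
    try:
        jsonschema.validate(params, fake_spec)
        return None
    except jsonschema.ValidationError as exc:
        return exc.message
```

An empty dict yields `'fake1' is a required property`, which the API wraps into the `Invalid parameters for strategy: ...` message asserted by the test.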
@@ -987,6 +1011,27 @@ class TestPost(api_base.FunctionalTest):
        self.assertEqual(HTTPStatus.CREATED, response.status_int)
        self.assertTrue(response.json['force'])

+   @mock.patch.object(deapi.DecisionEngineAPI, 'trigger_audit')
+   def test_create_audit_with_no_goal_no_name(self, mock_trigger_audit):
+       mock_trigger_audit.return_value = mock.ANY
+       audit_dict = post_get_test_audit(
+           params_to_exclude=['uuid', 'state', 'interval', 'scope',
+                              'next_run_time', 'hostname', 'goal',
+                              'audit_template_uuid', 'name'])
+       response = self.post_json(
+           '/audits',
+           audit_dict,
+           expect_errors=True,
+           headers={'OpenStack-API-Version': 'infra-optim 1.2'})
+       self.assertEqual('application/json', response.content_type)
+       self.assertEqual(HTTPStatus.BAD_REQUEST, response.status_int)
+       expected_msg = 'A valid goal or audit_template_id must be provided'
+       self.assertTrue(response.json['error_message'])
+       self.assertIn(expected_msg, response.json['error_message'])
+       assert not mock_trigger_audit.called
class TestDelete(api_base.FunctionalTest):


@@ -124,3 +124,30 @@ class TestDefaultActionPlanHandler(base.DbTestCase):
            self.context, mock.MagicMock(), self.action_plan.uuid)
        command.execute()
        self.assertEqual(ap_objects.State.CANCELLED, self.action_plan.state)

+   @mock.patch.object(objects.ActionPlan, "get_by_uuid")
+   @mock.patch.object(objects.Action, "list")
+   def test_launch_action_plan_failed_actions(self, m_action_list,
+                                              m_get_action_plan):
+       m_get_action_plan.return_value = self.action_plan
+       failed_action = self.action
+       failed_action.state = objects.action.State.FAILED
+       m_action_list.return_value = [failed_action]
+       command = default.DefaultActionPlanHandler(
+           self.context, mock.MagicMock(), self.action_plan.uuid)
+       command.execute()
+       expected_calls = [
+           mock.call(self.context, self.action_plan,
+                     action=objects.fields.NotificationAction.EXECUTION,
+                     phase=objects.fields.NotificationPhase.START),
+           mock.call(self.context, self.action_plan,
+                     action=objects.fields.NotificationAction.EXECUTION,
+                     priority=objects.fields.NotificationPriority.ERROR,
+                     phase=objects.fields.NotificationPhase.ERROR)]
+       self.assertEqual(ap_objects.State.FAILED, self.action_plan.state)
+       self.assertEqual(
+           expected_calls,
+           self.m_action_plan_notifications
+           .send_action_notification
+           .call_args_list)


@@ -22,7 +22,6 @@ from watcher.common import cinder_helper
from watcher.common import clients
from watcher.common import keystone_helper
from watcher.common import nova_helper
-from watcher.common import utils as w_utils
from watcher.tests import base
@@ -102,12 +101,15 @@ class TestMigration(base.TestCase):
    @staticmethod
    def fake_volume(**kwargs):
+       # FIXME(sean-k-mooney): we should be using real objects in this
+       # test or at lease something more Representative of the real data
        volume = mock.MagicMock()
        volume.id = kwargs.get('id', TestMigration.VOLUME_UUID)
        volume.size = kwargs.get('size', '1')
        volume.status = kwargs.get('status', 'available')
        volume.snapshot_id = kwargs.get('snapshot_id', None)
        volume.availability_zone = kwargs.get('availability_zone', 'nova')
+       volume.attachments = kwargs.get('attachments', [])
        return volume
@staticmethod @staticmethod
@@ -175,42 +177,14 @@ class TestMigration(base.TestCase):
            "storage1-typename",
        )

-   def test_swap_success(self):
-       volume = self.fake_volume(
-           status='in-use', attachments=[{'server_id': 'server_id'}])
-       self.m_n_helper.find_instance.return_value = self.fake_instance()
-       new_volume = self.fake_volume(id=w_utils.generate_uuid())
-       user = mock.Mock()
-       session = mock.MagicMock()
-       self.m_k_helper.create_user.return_value = user
-       self.m_k_helper.create_session.return_value = session
-       self.m_c_helper.get_volume.return_value = volume
-       self.m_c_helper.create_volume.return_value = new_volume
-
-       result = self.action_swap.execute()
-       self.assertTrue(result)
-
-       self.m_n_helper.swap_volume.assert_called_once_with(
-           volume,
-           new_volume
-       )
-       self.m_k_helper.delete_user.assert_called_once_with(user)

-   def test_swap_fail(self):
-       # _can_swap fail
-       instance = self.fake_instance(status='STOPPED')
-       self.m_n_helper.find_instance.return_value = instance
-
-       result = self.action_swap.execute()
-       self.assertFalse(result)

    def test_can_swap_success(self):
        volume = self.fake_volume(
-           status='in-use', attachments=[{'server_id': 'server_id'}])
+           status='in-use', attachments=[
+               {'server_id': TestMigration.INSTANCE_UUID}])
        instance = self.fake_instance()
        self.m_n_helper.find_instance.return_value = instance
        result = self.action_swap._can_swap(volume)
        self.assertTrue(result)
@@ -219,16 +193,33 @@ class TestMigration(base.TestCase):
        result = self.action_swap._can_swap(volume)
        self.assertTrue(result)

-       instance = self.fake_instance(status='RESIZED')
-       self.m_n_helper.find_instance.return_value = instance
-       result = self.action_swap._can_swap(volume)
-       self.assertTrue(result)

    def test_can_swap_fail(self):
        volume = self.fake_volume(
-           status='in-use', attachments=[{'server_id': 'server_id'}])
+           status='in-use', attachments=[
+               {'server_id': TestMigration.INSTANCE_UUID}])
        instance = self.fake_instance(status='STOPPED')
        self.m_n_helper.find_instance.return_value = instance
        result = self.action_swap._can_swap(volume)
        self.assertFalse(result)

+       instance = self.fake_instance(status='RESIZED')
+       self.m_n_helper.find_instance.return_value = instance
+       result = self.action_swap._can_swap(volume)
+       self.assertFalse(result)

+   def test_swap_success(self):
+       volume = self.fake_volume(
+           status='in-use', attachments=[
+               {'server_id': TestMigration.INSTANCE_UUID}])
+       self.m_c_helper.get_volume.return_value = volume
+       instance = self.fake_instance()
+       self.m_n_helper.find_instance.return_value = instance
+
+       result = self.action_swap.execute()
+       self.assertTrue(result)
+
+       self.m_c_helper.migrate.assert_called_once_with(
+           volume,
+           "storage1-poolname"
+       )


@@ -146,8 +146,9 @@ class TestPrometheusHelper(base.BaseTestCase):
        )
        self.assertEqual(expected_cpu_usage, result)
        mock_prometheus_query.assert_called_once_with(
-           "100 - (avg by (instance)(rate(node_cpu_seconds_total"
-           "{mode='idle',instance='10.0.1.2:9100'}[300s])) * 100)")
+           "100 - (avg by (fqdn)(rate(node_cpu_seconds_total"
+           "{mode='idle',fqdn='marios-env.controlplane.domain'}[300s]))"
+           " * 100)")
    @mock.patch.object(prometheus_client.PrometheusAPIClient, 'query')
    @mock.patch.object(prometheus_client.PrometheusAPIClient, '_get')
@@ -241,7 +242,7 @@ class TestPrometheusHelper(base.BaseTestCase):
        self.assertEqual(expected_cpu_usage, result_cpu)
        self.assertIsInstance(result_cpu, float)
        mock_prometheus_query.assert_called_once_with(
-           "clamp_max((avg by (instance)(rate("
+           "clamp_max((avg by (resource)(rate("
            "ceilometer_cpu{resource='uuid-0'}[300s]))"
            "/10e+8) *(100/2), 100)"
        )
@@ -437,15 +438,48 @@ class TestPrometheusHelper(base.BaseTestCase):
                'instance': '10.1.2.3:9100', 'job': 'node',
            }},
        ]}}
-       expected_fqdn_map = {'foo.controlplane.domain': '10.1.2.1:9100',
-                            'bar.controlplane.domain': '10.1.2.2:9100',
-                            'baz.controlplane.domain': '10.1.2.3:9100'}
-       expected_host_map = {'foo': '10.1.2.1:9100',
-                            'bar': '10.1.2.2:9100',
-                            'baz': '10.1.2.3:9100'}
+       expected_fqdn_list = {'foo.controlplane.domain',
+                             'bar.controlplane.domain',
+                             'baz.controlplane.domain'}
+       expected_host_map = {'foo': 'foo.controlplane.domain',
+                            'bar': 'bar.controlplane.domain',
+                            'baz': 'baz.controlplane.domain'}
        helper = prometheus_helper.PrometheusHelper()
-       self.assertEqual(helper.prometheus_fqdn_instance_map,
-                        expected_fqdn_map)
+       self.assertEqual(helper.prometheus_fqdn_labels,
+                        expected_fqdn_list)
        self.assertEqual(helper.prometheus_host_instance_map,
                         expected_host_map)

+   @mock.patch.object(prometheus_client.PrometheusAPIClient, '_get')
+   def test_build_prometheus_fqdn_host_instance_map_dupl_fqdn(
+           self, mock_prometheus_get):
+       mock_prometheus_get.return_value = {'data': {'activeTargets': [
+           {'labels': {
+               'fqdn': 'foo.controlplane.domain',
+               'instance': '10.1.2.1:9100', 'job': 'node',
+           }},
+           {'labels': {
+               'fqdn': 'foo.controlplane.domain',
+               'instance': '10.1.2.1:9229', 'job': 'podman',
+           }},
+           {'labels': {
+               'fqdn': 'bar.controlplane.domain',
+               'instance': '10.1.2.2:9100', 'job': 'node',
+           }},
+           {'labels': {
+               'fqdn': 'baz.controlplane.domain',
+               'instance': '10.1.2.3:9100', 'job': 'node',
+           }},
+       ]}}
+       expected_fqdn_list = {'foo.controlplane.domain',
+                             'bar.controlplane.domain',
+                             'baz.controlplane.domain'}
+       expected_host_map = {'foo': 'foo.controlplane.domain',
+                            'bar': 'bar.controlplane.domain',
+                            'baz': 'baz.controlplane.domain'}
+       helper = prometheus_helper.PrometheusHelper()
+       self.assertEqual(helper.prometheus_fqdn_labels,
+                        expected_fqdn_list)
+       self.assertEqual(helper.prometheus_host_instance_map,
+                        expected_host_map)
@@ -460,7 +494,7 @@ class TestPrometheusHelper(base.BaseTestCase):
            }},
        ]}}
        helper = prometheus_helper.PrometheusHelper()
-       self.assertEqual({}, helper.prometheus_fqdn_instance_map)
+       self.assertEqual(set(), helper.prometheus_fqdn_labels)
        self.assertEqual({}, helper.prometheus_host_instance_map)
    @mock.patch.object(prometheus_client.PrometheusAPIClient, '_get')
@@ -476,12 +510,29 @@ class TestPrometheusHelper(base.BaseTestCase):
            }},
        ]}}
        helper = prometheus_helper.PrometheusHelper()
-       expected_fqdn_map = {'ena': '10.1.2.1:9100',
-                            'dyo': '10.1.2.2:9100'}
+       expected_fqdn_list = {'ena', 'dyo'}
        self.assertEqual(
-           helper.prometheus_fqdn_instance_map, expected_fqdn_map)
+           helper.prometheus_fqdn_labels, expected_fqdn_list)
        self.assertEqual({}, helper.prometheus_host_instance_map)

+   @mock.patch.object(prometheus_client.PrometheusAPIClient, '_get')
+   def test_using_ips_not_fqdn(self, mock_prometheus_get):
+       mock_prometheus_get.return_value = {'data': {'activeTargets': [
+           {'labels': {
+               'ip_label': '10.1.2.1',
+               'instance': '10.1.2.1:9100', 'job': 'node',
+           }},
+           {'labels': {
+               'ip_label': '10.1.2.2',
+               'instance': '10.1.2.2:9100', 'job': 'node',
+           }},
+       ]}}
+       cfg.CONF.prometheus_client.fqdn_label = 'ip_label'
+       helper = prometheus_helper.PrometheusHelper()
+       expected_fqdn_list = {'10.1.2.1', '10.1.2.2'}
+       self.assertEqual(
+           helper.prometheus_fqdn_labels, expected_fqdn_list)
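These tests pin down the new behaviour: the helper now keeps a set of fqdn-label values plus a short-hostname map, instead of mapping names to scrape addresses, and labels without a dotted hostname (short names or bare IPs) get no host-map entry. The gist of that derivation, reduced to plain data with a hypothetical helper (not Watcher's actual method), might look like:

```python
def build_fqdn_maps(active_targets, fqdn_label='fqdn'):
    """Collect fqdn-label values and a short-host -> fqdn map.

    Targets missing the configured fqdn label are skipped. Labels that
    are not dotted hostnames (short names, bare IPs) contribute to the
    label set but produce no host-map entry.
    """
    fqdn_labels = set()
    host_map = {}
    for target in active_targets:
        fqdn = target.get('labels', {}).get(fqdn_label)
        if not fqdn:
            continue
        fqdn_labels.add(fqdn)  # a set deduplicates repeated fqdns
        if '.' in fqdn and not fqdn.replace('.', '').isdigit():
            host_map[fqdn.split('.')[0]] = fqdn
    return fqdn_labels, host_map
```

Using a set makes duplicate fqdns across jobs (the `node` vs `podman` case above) collapse naturally, which is exactly what the `dupl_fqdn` test asserts.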
    @mock.patch.object(prometheus_client.PrometheusAPIClient, '_get')
    def test_override_prometheus_fqdn_label(self, mock_prometheus_get):
        mock_prometheus_get.return_value = {'data': {'activeTargets': [
@@ -494,19 +545,19 @@ class TestPrometheusHelper(base.BaseTestCase):
                'instance': '10.1.2.2:9100', 'job': 'node',
            }},
        ]}}
-       expected_fqdn_map = {'foo.controlplane.domain': '10.1.2.1:9100',
-                            'bar.controlplane.domain': '10.1.2.2:9100'}
-       expected_host_map = {'foo': '10.1.2.1:9100',
-                            'bar': '10.1.2.2:9100'}
+       expected_fqdn_list = {'foo.controlplane.domain',
+                             'bar.controlplane.domain'}
+       expected_host_map = {'foo': 'foo.controlplane.domain',
+                            'bar': 'bar.controlplane.domain'}
        cfg.CONF.prometheus_client.fqdn_label = 'custom_fqdn_label'
        helper = prometheus_helper.PrometheusHelper()
-       self.assertEqual(helper.prometheus_fqdn_instance_map,
-                        expected_fqdn_map)
+       self.assertEqual(helper.prometheus_fqdn_labels,
+                        expected_fqdn_list)
        self.assertEqual(helper.prometheus_host_instance_map,
                         expected_host_map)
    def test_resolve_prometheus_instance_label(self):
-       expected_instance_label = '10.0.1.2:9100'
+       expected_instance_label = 'marios-env.controlplane.domain'
        result = self.helper._resolve_prometheus_instance_label(
            'marios-env.controlplane.domain')
        self.assertEqual(result, expected_instance_label)
@@ -524,34 +575,53 @@ class TestPrometheusHelper(base.BaseTestCase):
    def test_build_prometheus_query_node_cpu_avg_agg(self):
        expected_query = (
-           "100 - (avg by (instance)(rate(node_cpu_seconds_total"
-           "{mode='idle',instance='a_host'}[111s])) * 100)")
+           "100 - (avg by (fqdn)(rate(node_cpu_seconds_total"
+           "{mode='idle',fqdn='a_host'}[111s])) * 100)")
        result = self.helper._build_prometheus_query(
            'avg', 'node_cpu_seconds_total', 'a_host', '111')
        self.assertEqual(result, expected_query)

    def test_build_prometheus_query_node_cpu_max_agg(self):
        expected_query = (
-           "100 - (max by (instance)(rate(node_cpu_seconds_total"
-           "{mode='idle',instance='b_host'}[444s])) * 100)")
+           "100 - (max by (fqdn)(rate(node_cpu_seconds_total"
+           "{mode='idle',fqdn='b_host'}[444s])) * 100)")
        result = self.helper._build_prometheus_query(
            'max', 'node_cpu_seconds_total', 'b_host', '444')
        self.assertEqual(result, expected_query)

    def test_build_prometheus_query_node_memory_avg_agg(self):
        expected_query = (
-           "(node_memory_MemTotal_bytes{instance='c_host'} - avg_over_time"
-           "(node_memory_MemAvailable_bytes{instance='c_host'}[555s])) "
-           "/ 1024 / 1024")
+           "(node_memory_MemTotal_bytes{fqdn='c_host'} - avg_over_time"
+           "(node_memory_MemAvailable_bytes{fqdn='c_host'}[555s])) "
+           "/ 1024")
        result = self.helper._build_prometheus_query(
            'avg', 'node_memory_MemAvailable_bytes', 'c_host', '555')
        self.assertEqual(result, expected_query)

    def test_build_prometheus_query_node_memory_min_agg(self):
        expected_query = (
-           "(node_memory_MemTotal_bytes{instance='d_host'} - min_over_time"
-           "(node_memory_MemAvailable_bytes{instance='d_host'}[222s])) "
-           "/ 1024 / 1024")
+           "(node_memory_MemTotal_bytes{fqdn='d_host'} - min_over_time"
+           "(node_memory_MemAvailable_bytes{fqdn='d_host'}[222s])) "
+           "/ 1024")
        result = self.helper._build_prometheus_query(
            'min', 'node_memory_MemAvailable_bytes', 'd_host', '222')
        self.assertEqual(result, expected_query)

+   def test_build_prometheus_query_node_cpu_avg_agg_custom_label(self):
+       self.helper.prometheus_fqdn_label = 'custom_fqdn_label'
+       expected_query = (
+           "100 - (avg by (custom_fqdn_label)(rate(node_cpu_seconds_total"
+           "{mode='idle',custom_fqdn_label='a_host'}[111s])) * 100)")
+       result = self.helper._build_prometheus_query(
+           'avg', 'node_cpu_seconds_total', 'a_host', '111')
+       self.assertEqual(result, expected_query)

+   def test_build_prometheus_query_node_memory_min_agg_custom_label(self):
+       self.helper.prometheus_fqdn_label = 'custom_fqdn'
+       expected_query = (
+           "(node_memory_MemTotal_bytes{custom_fqdn='d_host'} - min_over_time"
+           "(node_memory_MemAvailable_bytes{custom_fqdn='d_host'}[222s])) "
+           "/ 1024")
+       result = self.helper._build_prometheus_query(
+           'min', 'node_memory_MemAvailable_bytes', 'd_host', '222')
+       self.assertEqual(result, expected_query)
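Read together, the expected strings pin down the two query templates: node CPU usage is an inverted idle-rate percentage aggregated by the configured fqdn label, and node memory usage is MemTotal minus the aggregated MemAvailable, now reported in KiB (`/ 1024`) rather than MiB. A standalone sketch that reproduces these strings, inferred from the tests rather than copied from Watcher's `_build_prometheus_query`:

```python
def build_node_query(aggregate, metric, host, period, fqdn_label='fqdn'):
    """Build the PromQL strings the tests above expect (illustrative)."""
    if metric == 'node_cpu_seconds_total':
        # % CPU used = 100 - % time spent in the 'idle' mode
        return ("100 - ({agg} by ({label})(rate(node_cpu_seconds_total"
                "{{mode='idle',{label}='{host}'}}[{period}s])) * 100)").format(
                    agg=aggregate, label=fqdn_label,
                    host=host, period=period)
    if metric == 'node_memory_MemAvailable_bytes':
        # used memory in KiB = total - <agg>_over_time(available)
        return ("(node_memory_MemTotal_bytes{{{label}='{host}'}} - "
                "{agg}_over_time(node_memory_MemAvailable_bytes"
                "{{{label}='{host}'}}[{period}s])) / 1024").format(
                    agg=aggregate, label=fqdn_label,
                    host=host, period=period)
    raise ValueError('unsupported metric: %s' % metric)
```

Parameterizing on `fqdn_label` is what makes the `custom_label` variants above work without any other change to the templates.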
@@ -574,7 +644,7 @@ class TestPrometheusHelper(base.BaseTestCase):
    def test_build_prometheus_query_instance_cpu_avg_agg(self):
        expected_query = (
-           "clamp_max((avg by (instance)(rate("
+           "clamp_max((avg by (resource)(rate("
            "ceilometer_cpu{resource='uuid-0'}[222s]))"
            "/10e+8) *(100/2), 100)"
        )
@@ -585,7 +655,7 @@ class TestPrometheusHelper(base.BaseTestCase):
    def test_build_prometheus_query_instance_cpu_max_agg(self):
        expected_query = (
-           "clamp_max((max by (instance)(rate("
+           "clamp_max((max by (resource)(rate("
            "ceilometer_cpu{resource='uuid-0'}[555s]))"
            "/10e+8) *(100/4), 100)"
        )
@@ -629,7 +699,7 @@ class TestPrometheusHelper(base.BaseTestCase):
    def test_prometheus_query_custom_uuid_label(self, mock_prometheus_get):
        cfg.CONF.prometheus_client.instance_uuid_label = 'custom_uuid_label'
        expected_query = (
-           "clamp_max((max by (instance)"
+           "clamp_max((max by (custom_uuid_label)"
            "(rate(ceilometer_cpu{custom_uuid_label='uuid-0'}[555s]))"
            "/10e+8) *(100/4), 100)"
        )


@@ -111,7 +111,7 @@ class TestHostMaintenance(TestBaseStrategy):
        self.strategy.instance_migration(instance_0, node_0, node_1)
        self.assertEqual(1, len(self.strategy.solution.actions))
        expected = [{'action_type': 'migrate',
-                    'input_parameters': {'destination_node': node_1.uuid,
+                    'input_parameters': {'destination_node': node_1.hostname,
                                          'source_node': node_0.uuid,
                                          'migration_type': 'live',
                                          'resource_id': instance_0.uuid,
@@ -144,14 +144,14 @@ class TestHostMaintenance(TestBaseStrategy):
        self.strategy.host_migration(node_0, node_1)
        self.assertEqual(2, len(self.strategy.solution.actions))
        expected = [{'action_type': 'migrate',
-                    'input_parameters': {'destination_node': node_1.uuid,
+                    'input_parameters': {'destination_node': node_1.hostname,
                                          'source_node': node_0.uuid,
                                          'migration_type': 'live',
                                          'resource_id': instance_0.uuid,
                                          'resource_name': instance_0.name
                                          }},
                    {'action_type': 'migrate',
-                    'input_parameters': {'destination_node': node_1.uuid,
+                    'input_parameters': {'destination_node': node_1.hostname,
                                          'source_node': node_0.uuid,
                                          'migration_type': 'live',
                                          'resource_id': instance_1.uuid,
@@ -167,12 +167,15 @@ class TestHostMaintenance(TestBaseStrategy):
        node_1 = model.get_node_by_uuid('Node_1')
        self.assertFalse(self.strategy.safe_maintain(node_0))
        self.assertFalse(self.strategy.safe_maintain(node_1))
+       # It will return true, if backup node is passed
+       self.assertTrue(self.strategy.safe_maintain(node_0, node_1))
        model = self.fake_c_cluster.\
            generate_scenario_1_with_all_nodes_disable()
        self.m_c_model.return_value = model
        node_0 = model.get_node_by_uuid('Node_0')
-       self.assertTrue(self.strategy.safe_maintain(node_0))
+       # It will return false, if there is no backup node
+       self.assertFalse(self.strategy.safe_maintain(node_0))
    def test_try_maintain(self):
        model = self.fake_c_cluster.generate_scenario_1()
@@ -213,7 +216,7 @@ class TestHostMaintenance(TestBaseStrategy):
                     'disabled_reason': 'watcher_maintaining'}},
                    {'action_type': 'migrate',
                     'input_parameters': {
-                        'destination_node': node_3.uuid,
+                        'destination_node': node_3.hostname,
                         'source_node': node_2.uuid,
                         'migration_type': 'live',
                         'resource_id': instance_4.uuid,


@@ -212,11 +212,71 @@ class TestWorkloadStabilization(TestBaseStrategy):
            len(self.strategy.simulate_migrations(self.hosts_load_assert)))

    def test_check_threshold(self):
+       # sd for 0.05, 0.05, 0.07, 0.07, 0.8
+       test_cpu_sd = 0.296
        self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()
-       self.strategy.thresholds = {'instance_cpu_usage': 0.001,
+       self.strategy.thresholds = {'instance_cpu_usage': 0.25,
                                    'instance_ram_usage': 0.2}
        self.strategy.simulate_migrations = mock.Mock(return_value=True)
        self.assertTrue(self.strategy.check_threshold())
+       self.assertEqual(
+           round(self.strategy.sd_before_audit, 3),
+           test_cpu_sd)

+   def test_check_threshold_cpu(self):
+       test_cpu_sd = 0.296
+       self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()
+       self.strategy.metrics = ["instance_cpu_usage"]
+       self.strategy.thresholds = {'instance_cpu_usage': 0.25}
+       self.strategy.simulate_migrations = mock.Mock(return_value=True)
+       self.assertTrue(self.strategy.check_threshold())
+       self.assertEqual(
+           round(self.strategy.sd_before_audit, 3),
+           test_cpu_sd)

+   def test_check_threshold_ram(self):
+       # sd for 4,5,7,8, 29 MB used in 132MB total memory hosts
+       test_ram_sd = 0.071
+       self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()
+       self.strategy.metrics = ["instance_ram_usage"]
+       self.strategy.thresholds = {'instance_cpu_usage': 0.25,
+                                   'instance_ram_usage': 0.05}
+       self.strategy.simulate_migrations = mock.Mock(return_value=True)
+       self.assertTrue(self.strategy.check_threshold())
+       self.assertEqual(
+           round(self.strategy.sd_before_audit, 3),
+           test_ram_sd)

+   def test_check_threshold_fail(self):
+       self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()
+       self.strategy.thresholds = {'instance_cpu_usage': 0.3,
+                                   'instance_ram_usage': 0.2}
+       self.strategy.simulate_migrations = mock.Mock(return_value=True)
+       self.assertFalse(self.strategy.check_threshold())

+   def test_check_threshold_cpu_fail(self):
+       self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()
+       self.strategy.metrics = ["instance_cpu_usage"]
+       self.strategy.thresholds = {'instance_cpu_usage': 0.4}
+       self.strategy.simulate_migrations = mock.Mock(return_value=True)
+       self.assertFalse(self.strategy.check_threshold())

+   def test_check_threshold_ram_fail(self):
+       self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()
+       self.strategy.metrics = ["instance_ram_usage"]
+       self.strategy.thresholds = {'instance_cpu_usage': 0.25,
+                                   'instance_ram_usage': 0.1}
+       self.strategy.simulate_migrations = mock.Mock(return_value=True)
+       self.assertFalse(self.strategy.check_threshold())
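The hard-coded expectations (0.296 for CPU, 0.071 for RAM) are just the population standard deviation of the per-host loads named in the test comments, which is the balance measure the workload-stabilization strategy compares against its thresholds. They can be checked directly:

```python
import math


def population_sd(values):
    """Population standard deviation of per-host load fractions."""
    mean = sum(values) / len(values)
    return math.sqrt(sum((v - mean) ** 2 for v in values) / len(values))


# per-host CPU usage fractions from the scenario comment
cpu_loads = [0.05, 0.05, 0.07, 0.07, 0.8]
# 4, 5, 7, 8 and 29 MB used on hosts with 132 MB total memory
ram_loads = [used / 132 for used in (4, 5, 7, 8, 29)]
```

Rounded to three places these give 0.296 and 0.071, matching `sd_before_audit` in the passing tests, and both sit above the 0.25 / 0.05 thresholds, which is why `check_threshold()` returns True there.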
    def test_execute_one_migration(self):
        self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()


@@ -320,7 +320,10 @@ class TestZoneMigration(TestBaseStrategy):
        migration_types = collections.Counter(
            [action.get('input_parameters')['migration_type']
             for action in solution.actions])
-       self.assertEqual(1, migration_types.get("swap", 0))
+       # watcher no longer implements swap. it is now an
+       # alias for migrate.
+       self.assertEqual(0, migration_types.get("swap", 0))
+       self.assertEqual(1, migration_types.get("migrate", 1))
        global_efficacy_value = solution.global_efficacy[3].get('value', 0)
        self.assertEqual(100, global_efficacy_value)

watcher/wsgi/__init__.py (new file, empty)

watcher/wsgi/api.py (new file)

@@ -0,0 +1,18 @@
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+"""WSGI application entry-point for Watcher API."""
+
+import threading
+
+from watcher.api import wsgi
+
+application = None
+with threading.Lock():
+    if application is None:
+        application = wsgi.initialize_wsgi_app()
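The new module initializes the `application` object once, at import time, under a lock. The same initialize-exactly-once guard, written as a reusable factory so the idea is visible in isolation (a generic sketch of the pattern, not part of Watcher):

```python
import threading

_lock = threading.Lock()
_app = None


def get_application(factory):
    """Create the WSGI app exactly once, even with concurrent callers.

    The double check (before and after acquiring the lock) avoids
    serializing every call once the app exists, while guaranteeing
    that `factory` runs at most once.
    """
    global _app
    if _app is None:
        with _lock:
            if _app is None:  # re-check under the lock
                _app = factory()
    return _app
```

A WSGI server would then be pointed at the module-level `application` object, e.g. `watcher.wsgi.api:application`.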