Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bugfix.confirmed 2 failed after push ok #317

Merged
merged 11 commits into from
Sep 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docker/api/config/api.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ mgmtdomain_reserved_count: 5
mgmtdomain_primary_ip_version: 4
commit_confirmed_mode: 1
commit_confirmed_timeout: 300
commit_confirmed_wait: 1
2 changes: 2 additions & 0 deletions docs/configuration/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ Defines parameters for the API:
(see :ref:`commit_confirm_modes`). Defaults to 1.
- commit_confirmed_timeout: Time to wait before rolling back an unconfirmed commit,
specified in seconds. Defaults to 300.
- commit_confirmed_wait: Time to wait between comitting configuration and checking
that the device is still reachable, specified in seconds. Defaults to 1.

/etc/cnaas-nms/repository.yml
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Expand Down
2 changes: 2 additions & 0 deletions src/cnaas_nms/app_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ class ApiSettings(BaseSettings):
MGMTDOMAIN_PRIMARY_IP_VERSION: int = 4
COMMIT_CONFIRMED_MODE: int = 1
COMMIT_CONFIRMED_TIMEOUT: int = 300
COMMIT_CONFIRMED_WAIT: int = 1
SETTINGS_OVERRIDE: Optional[dict] = None

@validator("MGMTDOMAIN_PRIMARY_IP_VERSION")
Expand Down Expand Up @@ -90,6 +91,7 @@ def construct_api_settings() -> ApiSettings:
MGMTDOMAIN_PRIMARY_IP_VERSION=config.get("mgmtdomain_primary_ip_version", 4),
COMMIT_CONFIRMED_MODE=config.get("commit_confirmed_mode", 1),
COMMIT_CONFIRMED_TIMEOUT=config.get("commit_confirmed_timeout", 300),
COMMIT_CONFIRMED_WAIT=config.get("commit_confirmed_wait", 1),
SETTINGS_OVERRIDE=config.get("settings_override", None),
)
else:
Expand Down
49 changes: 40 additions & 9 deletions src/cnaas_nms/devicehandler/sync_devices.py
Original file line number Diff line number Diff line change
Expand Up @@ -557,9 +557,33 @@
task_args["job_id"] = job_id
task_args["confirm_mode_override"] = confirm_mode
logger.debug("Commit confirm mode for host {}: {} (dry_run: {})".format(task.host.name, confirm_mode, dry_run))
task.run(**task_args)
if confirm_mode != 2:
task.host.close_connection("napalm")
try:
task.run(**task_args)
except Exception as e:
logger.exception("Exception while running task napalm_configure for device {}".format(task.host.name))
raise e

Check warning on line 564 in src/cnaas_nms/devicehandler/sync_devices.py

View check run for this annotation

Codecov / codecov/patch

src/cnaas_nms/devicehandler/sync_devices.py#L562-L564

Added lines #L562 - L564 were not covered by tests
finally:
if confirm_mode != 2:
task.host.close_connection("napalm")
if confirm_mode == 2 and not dry_run:
time.sleep(api_settings.COMMIT_CONFIRMED_WAIT)
try:
task.run(task=napalm_get, getters=["facts"], name="Verify reachability")
except Exception as e:
add_sync_event(task.host.name, "commit_confirm_failed", scheduled_by, job_id)
logger.error(

Check warning on line 574 in src/cnaas_nms/devicehandler/sync_devices.py

View check run for this annotation

Codecov / codecov/patch

src/cnaas_nms/devicehandler/sync_devices.py#L572-L574

Added lines #L572 - L574 were not covered by tests
"Could not reach device {} after commit, rollback in: {}s".format(
task.host.name, api_settings.COMMIT_CONFIRMED_TIMEOUT
)
)
raise e

Check warning on line 579 in src/cnaas_nms/devicehandler/sync_devices.py

View check run for this annotation

Codecov / codecov/patch

src/cnaas_nms/devicehandler/sync_devices.py#L579

Added line #L579 was not covered by tests
else:
short_facts = {"fqdn": "unknown"}
try:
short_facts["fqdn"] = task.results[2].result["facts"]["fqdn"]
task.results[2].result["facts"] = short_facts
except Exception:
pass

Check warning on line 586 in src/cnaas_nms/devicehandler/sync_devices.py

View check run for this annotation

Codecov / codecov/patch

src/cnaas_nms/devicehandler/sync_devices.py#L585-L586

Added lines #L585 - L586 were not covered by tests

if task.results[1].diff:
config = task.results[1].host["config"]
Expand Down Expand Up @@ -855,6 +879,7 @@
task=push_sync_device,
dry_run=dry_run,
job_id=job_id,
scheduled_by=scheduled_by,
confirm_mode=get_confirm_mode(confirm_mode_override),
)
except Exception as e:
Expand All @@ -881,7 +906,7 @@
unchanged_hosts = []
# calculate change impact score
for host, results in nrresult.items():
if host in failed_hosts or len(results) != 3:
if host in failed_hosts or len(results) < 3:
logger.debug("Unable to calculate change score for failed device {}".format(host))
elif results[2].diff:
changed_hosts.append(host)
Expand Down Expand Up @@ -923,6 +948,12 @@
remove_sync_events(hostname)
dev.last_seen = datetime.datetime.utcnow()
if not dry_run and get_confirm_mode(confirm_mode_override) != 2:
if failed_hosts:
logger.error(

Check warning on line 952 in src/cnaas_nms/devicehandler/sync_devices.py

View check run for this annotation

Codecov / codecov/patch

src/cnaas_nms/devicehandler/sync_devices.py#L952

Added line #L952 was not covered by tests
"One or more devices failed to commit configuration, they will roll back configuration"
" in {}s: {}".format(api_settings.COMMIT_CONFIRMED_TIMEOUT, ", ".join(failed_hosts))
)
time.sleep(api_settings.COMMIT_CONFIRMED_TIMEOUT)

Check warning on line 956 in src/cnaas_nms/devicehandler/sync_devices.py

View check run for this annotation

Codecov / codecov/patch

src/cnaas_nms/devicehandler/sync_devices.py#L956

Added line #L956 was not covered by tests
logger.info("Releasing lock for devices from syncto job: {}".format(job_id))
Joblock.release_lock(session, job_id=job_id)

Expand Down Expand Up @@ -958,18 +989,18 @@
f"{total_change_score} is higher than auto-push limit {AUTOPUSH_MAX_SCORE}"
)
elif get_confirm_mode(confirm_mode_override) == 2 and not dry_run:
if not changed_hosts:
logger.info("None of the selected host has any changes (diff), skipping commit-confirm")
logger.info("Releasing lock for devices from syncto job: {}".format(job_id))
Joblock.release_lock(session, job_id=job_id)
elif len(failed_hosts) > 0:
if failed_hosts:
logger.error(
"No confirm job scheduled since one or more devices failed in commitmode 2"
", all devices will rollback in {}s".format(api_settings.COMMIT_CONFIRMED_TIMEOUT)
)
time.sleep(api_settings.COMMIT_CONFIRMED_TIMEOUT)
logger.info("Releasing lock for devices from syncto job: {}".format(job_id))
Joblock.release_lock(session, job_id=job_id)
elif not changed_hosts:
logger.info("None of the selected host has any changes (diff), skipping commit-confirm")
logger.info("Releasing lock for devices from syncto job: {}".format(job_id))
Joblock.release_lock(session, job_id=job_id)

Check warning on line 1003 in src/cnaas_nms/devicehandler/sync_devices.py

View check run for this annotation

Codecov / codecov/patch

src/cnaas_nms/devicehandler/sync_devices.py#L1001-L1003

Added lines #L1001 - L1003 were not covered by tests
else:
scheduler = Scheduler()
next_job_id = scheduler.add_onetime_job(
Expand Down
10 changes: 8 additions & 2 deletions src/cnaas_nms/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@

def parse_redis_event(event):
try:
# [stream, [(messageid, {datadict})]
# [stream, [(messageid, {datadict})]]
if event[0] == "events":
return event[1][0][1]
except Exception: # noqa: S110
Expand All @@ -115,13 +115,19 @@
def thread_websocket_events():
redis: StrictRedis
with redis_session() as redis:
last_event = b"$"

Check warning on line 118 in src/cnaas_nms/run.py

View check run for this annotation

Codecov / codecov/patch

src/cnaas_nms/run.py#L118

Added line #L118 was not covered by tests
while True:
result = redis.xread({"events": b"$"}, count=10, block=200)
result = redis.xread({"events": last_event}, count=10, block=200)

Check warning on line 120 in src/cnaas_nms/run.py

View check run for this annotation

Codecov / codecov/patch

src/cnaas_nms/run.py#L120

Added line #L120 was not covered by tests
for item in result:
event = parse_redis_event(item)
if not event:
continue
emit_redis_event(event)
try:

Check warning on line 126 in src/cnaas_nms/run.py

View check run for this annotation

Codecov / codecov/patch

src/cnaas_nms/run.py#L126

Added line #L126 was not covered by tests
# [stream, [(messageid, {datadict})]]
last_event = item[1][0][0]
except Exception: # noqa: S110
last_event = b"$"

Check warning on line 130 in src/cnaas_nms/run.py

View check run for this annotation

Codecov / codecov/patch

src/cnaas_nms/run.py#L128-L130

Added lines #L128 - L130 were not covered by tests
if stop_websocket_threads:
break

Expand Down
1 change: 1 addition & 0 deletions src/cnaas_nms/scheduler/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,7 @@ def add_onetime_job(

kwargs["job_id"] = job_id
kwargs["scheduled_by"] = scheduled_by
kwargs["kwargs"]["scheduled_by"] = scheduled_by
if self.use_mule:
try:
import uwsgi
Expand Down
Loading