From 5fa2a8e11d73e0b5947da1dd8350fc955867de25 Mon Sep 17 00:00:00 2001 From: Mengxin Liu Date: Tue, 30 Aug 2022 16:04:47 +0800 Subject: [PATCH] feat: reduce downtime by increasing arp cache timeout When upgrading ovs-ovn, if an ARP cache entry times out, new ARP requests cannot be processed, which leads to network unreachable errors. Increase base_reachable_time_ms and gc_stale_time to 3 minutes to prevent ARP cache timeouts during the upgrade. (cherry picked from commit 0adecb0cb7217e77210b6afcf8b7bab530ae98b5) --- dist/images/install.sh | 12 ++++++++++++ dist/images/start-ovs.sh | 22 +++++++++++++++++++--- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/dist/images/install.sh b/dist/images/install.sh index 7cc36631cb2..94cbd45ccc3 100755 --- a/dist/images/install.sh +++ b/dist/images/install.sh @@ -1227,6 +1227,9 @@ spec: - name: OVN_DB_IPS value: $addresses volumeMounts: + - mountPath: /var/run/netns + name: host-ns + mountPropagation: HostToContainer - mountPath: /lib/modules name: host-modules readOnly: true @@ -1294,6 +1297,9 @@ spec: - name: host-sys hostPath: path: /sys + - name: host-ns + hostPath: + path: /var/run/netns - name: cni-conf hostPath: path: /etc/cni/net.d @@ -1732,6 +1738,9 @@ spec: - name: OVN_DB_IPS value: $addresses volumeMounts: + - mountPath: /var/run/netns + name: host-ns + mountPropagation: HostToContainer - mountPath: /lib/modules name: host-modules readOnly: true @@ -1795,6 +1804,9 @@ spec: - name: host-sys hostPath: path: /sys + - name: host-ns + hostPath: + path: /var/run/netns - name: cni-conf hostPath: path: /etc/cni/net.d diff --git a/dist/images/start-ovs.sh b/dist/images/start-ovs.sh index 70c44408e76..df7650bbbfa 100755 --- a/dist/images/start-ovs.sh +++ b/dist/images/start-ovs.sh @@ -35,9 +35,18 @@ cat /proc/cmdline" fi function quit { - /usr/share/ovn/scripts/grace_stop_ovn_controller - /usr/share/openvswitch/scripts/ovs-ctl stop - exit 0 + set +e + for netns in /var/run/netns/*; do + nsenter --net=$netns sysctl -w 
net.ipv4.neigh.eth0.base_reachable_time_ms=180000; + nsenter --net=$netns sysctl -w net.ipv4.neigh.eth0.gc_stale_time=180; + done + # If an ARP entry is in the stale or delay state, stopping vswitchd will cause probes to fail. + # Wait a while for the probes to become ready. + # As the timeout has been increased, existing entries will not change to stale or delay in the meantime. + sleep 5 + /usr/share/ovn/scripts/grace_stop_ovn_controller + /usr/share/openvswitch/scripts/ovs-ctl stop + exit 0 } trap quit EXIT @@ -182,5 +191,12 @@ set -e ovs-vsctl --no-wait set open_vswitch . other_config:flow-restore-wait="false" +set +e +for netns in /var/run/netns/*; do + nsenter --net=$netns sysctl -w net.ipv4.neigh.eth0.base_reachable_time_ms=30000; + nsenter --net=$netns sysctl -w net.ipv4.neigh.eth0.gc_stale_time=60; +done +set -e + chmod 600 /etc/openvswitch/* tail --follow=name --retry /var/log/ovn/ovn-controller.log