From 878d8bfca7a9d46fc4089225d0f4fb911ea3a1c5 Mon Sep 17 00:00:00 2001
From: tazlin <tazlin.on.github@gmail.com>
Date: Thu, 7 Mar 2024 07:36:19 -0500
Subject: [PATCH 1/5] fix: time out stalled lora metadata/model downloads
 sooner

---
 hordelib/model_manager/lora.py | 51 ++++++++++++++++++++++++----------
 1 file changed, 37 insertions(+), 14 deletions(-)

diff --git a/hordelib/model_manager/lora.py b/hordelib/model_manager/lora.py
index 4ea032e9..9ba110e7 100644
--- a/hordelib/model_manager/lora.py
+++ b/hordelib/model_manager/lora.py
@@ -44,14 +44,19 @@ class LoraModelManager(BaseModelManager):
     )
     LORA_API = "https://civitai.com/api/v1/models?types=LORA&sort=Highest%20Rated&primaryFileOnly=true"
     MAX_RETRIES = 10 if not TESTS_ONGOING else 3
-    MAX_DOWNLOAD_THREADS = 3
+    MAX_DOWNLOAD_THREADS = 5 if not TESTS_ONGOING else 15
     RETRY_DELAY = 3 if not TESTS_ONGOING else 0.2
     """The time to wait between retries in seconds"""
-    REQUEST_METADATA_TIMEOUT = 20
-    """The time to wait for a response from the server in seconds"""
-    REQUEST_DOWNLOAD_TIMEOUT = 300
-    """The time to wait for a response from the server in seconds"""
-    THREAD_WAIT_TIME = 2
+    REQUEST_METADATA_TIMEOUT = 20  # Longer because civitai performs poorly on metadata requests for more than 5 models
+    """The maximum time for no data to be received before we give up on a metadata fetch, in seconds"""
+    REQUEST_DOWNLOAD_TIMEOUT = 10 if not TESTS_ONGOING else 1
+    """The maximum time for no data to be received before we give up on a download, in seconds
+
+    This is not the time to download the file, but the time to wait in between data packets. \
+    If we're actively downloading and the connection to the server is alive, this doesn't come into play
+    """
+
+    THREAD_WAIT_TIME = 0.1
     """The time to wait between checking the download queue in seconds"""
 
     _file_lock: multiprocessing_lock | nullcontext
@@ -274,6 +279,7 @@ def _add_lora_ids_to_download_queue(self, lora_ids, adhoc=False, version_compare
     def _get_json(self, url):
         retries = 0
         while retries <= self.MAX_RETRIES:
+            response = None
             try:
                 response = requests.get(url, timeout=self.REQUEST_METADATA_TIMEOUT)
                 response.raise_for_status()
@@ -281,14 +287,31 @@ def _get_json(self, url):
                 return response.json()
 
             except (requests.HTTPError, requests.ConnectionError, requests.Timeout, json.JSONDecodeError) as e:
-                # CivitAI Errors when the model ID is too long
-                if response.status_code in [404, 500]:
+                logger.debug(f"url '{url}' download failed {type(e)} {e}")
+
+                # If this is a 401, 404, or 500, we're not going to get anywhere, just give up
+                # The following are the CivitAI errors encountered so far
+                # (and all of them will not fix themselves with retries as of writing)
+                # [401: requires a token, 404: model ID too long, 500: internal server error]
+                if response is not None and response.status_code in [401, 404, 500]:
                     logger.debug(f"url '{url}' download failed with status code {response.status_code}")
                     return None
 
-                logger.debug(f"url '{url}' download failed {type(e)} {e}")
+                # The json being invalid is a CivitAI issue, possibly it showing an HTML page and
+                # this isn't likely to change in the next 30 seconds, so we'll try twice more
+                # and give up if it doesn't work
+                if isinstance(e, json.JSONDecodeError):
+                    logger.debug(f"url '{url}' download failed with {type(e)} {e}")
+                    retries += 3
+
+                # If the network connection timed out, then self.REQUEST_METADATA_TIMEOUT seconds passed
+                # and the clock is ticking, so we'll try once more
+                if response is None:
+                    retries += 5
+
                 retries += 1
                 self.total_retries_attempted += 1
+
                 if retries <= self.MAX_RETRIES:
                     time.sleep(self.RETRY_DELAY)
                 else:
@@ -674,8 +697,8 @@ def clear_all_references(self):
     def wait_for_downloads(self, timeout=None):
         rtr = 0
         while not self.are_downloads_complete():
-            time.sleep(0.5)
-            rtr += 0.5
+            time.sleep(self.THREAD_WAIT_TIME)
+            rtr += self.THREAD_WAIT_TIME
             if timeout and rtr > timeout:
                 raise Exception(f"Lora downloads exceeded specified timeout ({timeout})")
         logger.debug("Downloads complete")
@@ -973,7 +996,7 @@ def reset_adhoc_loras(self):
             if self._stop_all_threads:
                 logger.debug("Stopped processing thread")
                 return
-            time.sleep(0.2)
+            time.sleep(self.THREAD_WAIT_TIME)
         now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
         self._adhoc_loras = set()
         unsorted_items = []
@@ -1073,8 +1096,8 @@ def is_adhoc_reset_complete(self):
     def wait_for_adhoc_reset(self, timeout=15):
         rtr = 0
         while not self.is_adhoc_reset_complete():
-            time.sleep(0.2)
-            rtr += 0.2
+            time.sleep(self.THREAD_WAIT_TIME)
+            rtr += self.THREAD_WAIT_TIME
             if timeout and rtr > timeout:
                 raise Exception(f"Lora adhoc reset exceeded specified timeout ({timeout})")
 

From 8e67a3b38a985356be9bb167793a0527dde77c8e Mon Sep 17 00:00:00 2001
From: tazlin <tazlin.on.github@gmail.com>
Date: Thu, 7 Mar 2024 07:43:25 -0500
Subject: [PATCH 2/5] fix: give longer metadata queries more time

---
 hordelib/model_manager/lora.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/hordelib/model_manager/lora.py b/hordelib/model_manager/lora.py
index 9ba110e7..dd6e2b1a 100644
--- a/hordelib/model_manager/lora.py
+++ b/hordelib/model_manager/lora.py
@@ -281,7 +281,10 @@ def _get_json(self, url):
         while retries <= self.MAX_RETRIES:
             response = None
             try:
-                response = requests.get(url, timeout=self.REQUEST_METADATA_TIMEOUT)
+                response = requests.get(
+                    url,
+                    timeout=self.REQUEST_METADATA_TIMEOUT if len(url) < 200 else self.REQUEST_METADATA_TIMEOUT * 1.5,
+                )
                 response.raise_for_status()
                 # Attempt to decode the response to JSON
                 return response.json()

From 30966b4438c9aba7c6ccf071e2c1a4176ba5ca36 Mon Sep 17 00:00:00 2001
From: tazlin <tazlin.on.github@gmail.com>
Date: Thu, 7 Mar 2024 08:15:37 -0500
Subject: [PATCH 3/5] fix: retry less often with TI model manager also

---
 hordelib/model_manager/ti.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/hordelib/model_manager/ti.py b/hordelib/model_manager/ti.py
index 30b14fb6..29c1a722 100644
--- a/hordelib/model_manager/ti.py
+++ b/hordelib/model_manager/ti.py
@@ -151,6 +151,7 @@ def _add_ti_ids_to_download_queue(self, ti_ids, adhoc=False, version_compare=Non
     def _get_json(self, url):
         retries = 0
         while retries <= self.MAX_RETRIES:
+            response = None
             try:
                 response = requests.get(url, timeout=self.REQUEST_METADATA_TIMEOUT)
                 response.raise_for_status()
@@ -159,8 +160,12 @@ def _get_json(self, url):
 
             except (requests.HTTPError, requests.ConnectionError, requests.Timeout, json.JSONDecodeError):
                 # CivitAI Errors when the model ID is too long
-                if response.status_code in [404, 500]:
+                if response is not None and response.status_code in [404, 500]:
                     return None
+
+                if response is None:
+                    retries += 5
+
                 retries += 1
                 self.total_retries_attempted += 1
                 if retries <= self.MAX_RETRIES:

From 51d9541d7bc0a1ca3238017abfb616df04341d09 Mon Sep 17 00:00:00 2001
From: tazlin <tazlin.on.github@gmail.com>
Date: Thu, 7 Mar 2024 09:00:32 -0500
Subject: [PATCH 4/5] fix: retry 500s a few times on lora/ti metadata dl
 timeout

---
 hordelib/model_manager/lora.py | 11 +++++++----
 hordelib/model_manager/ti.py   | 11 +++++++++--
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/hordelib/model_manager/lora.py b/hordelib/model_manager/lora.py
index dd6e2b1a..f9f0623e 100644
--- a/hordelib/model_manager/lora.py
+++ b/hordelib/model_manager/lora.py
@@ -294,11 +294,14 @@ def _get_json(self, url):
 
                 # If this is a 401, 404, or 500, we're not going to get anywhere, just give up
                 # The following are the CivitAI errors encountered so far
-                # (and all of them will not fix themselves with retries as of writing)
                 # [401: requires a token, 404: model ID too long, 500: internal server error]
-                if response is not None and response.status_code in [401, 404, 500]:
-                    logger.debug(f"url '{url}' download failed with status code {response.status_code}")
-                    return None
+                if response is not None:
+                    if response.status_code in [401, 404]:
+                        logger.debug(f"url '{url}' download failed with status code {response.status_code}")
+                        return None
+                    if response.status_code == 500:
+                        logger.debug(f"url '{url}' download failed with status code {response.status_code}")
+                        retries += 3
 
                 # The json being invalid is a CivitAI issue, possibly it showing an HTML page and
                 # this isn't likely to change in the next 30 seconds, so we'll try twice more
diff --git a/hordelib/model_manager/ti.py b/hordelib/model_manager/ti.py
index 29c1a722..e8c7ffc0 100644
--- a/hordelib/model_manager/ti.py
+++ b/hordelib/model_manager/ti.py
@@ -160,8 +160,15 @@ def _get_json(self, url):
 
             except (requests.HTTPError, requests.ConnectionError, requests.Timeout, json.JSONDecodeError):
                 # CivitAI Errors when the model ID is too long
-                if response is not None and response.status_code in [404, 500]:
-                    return None
+                if response is not None:
+                    if response.status_code in [401, 404]:
+                        return None
+                    if response.status_code == 500:
+                        retries += 3
+                        logger.debug(
+                            "CivitAI reported an internal error when downloading metadata. "
+                            "Fewer retries will be attempted.",
+                        )
 
                 if response is None:
                     retries += 5

From 820b1c3a45dbc0aef4cea13622ce275bbb3bfcf9 Mon Sep 17 00:00:00 2001
From: tazlin <tazlin.on.github@gmail.com>
Date: Thu, 7 Mar 2024 09:58:31 -0500
Subject: [PATCH 5/5] tests: correct non-existent lora test for new retry logic

---
 tests/model_managers/test_mm_lora.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/model_managers/test_mm_lora.py b/tests/model_managers/test_mm_lora.py
index 40c8b549..138137f5 100644
--- a/tests/model_managers/test_mm_lora.py
+++ b/tests/model_managers/test_mm_lora.py
@@ -220,7 +220,7 @@ def test_adhoc_non_existing_intstring_large(self):
         lora_model_manager.wait_for_adhoc_reset(15)
         lora_name = "99999999999999"
         lora_key = lora_model_manager.fetch_adhoc_lora(lora_name)
-        assert lora_model_manager.total_retries_attempted == 0
+        assert lora_model_manager.total_retries_attempted == 1
         assert lora_key is None
         assert not lora_model_manager.is_model_available(lora_name)
         lora_model_manager.stop_all()