Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Bugfix]: serialize config instances by value when using --trust-remote-code #6751

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion vllm/executor/ray_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

class RayWorkerWrapper(WorkerWrapperBase):
"""Ray wrapper for vllm.worker.Worker, allowing Worker to be
lazliy initialized after Ray sets CUDA_VISIBLE_DEVICES."""
lazily initialized after Ray sets CUDA_VISIBLE_DEVICES."""

def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
Expand Down
22 changes: 22 additions & 0 deletions vllm/transformers_utils/config.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import contextlib
from typing import Dict, Optional, Type

import ray
from transformers import GenerationConfig, PretrainedConfig

from vllm.envs import VLLM_USE_MODELSCOPE
Expand Down Expand Up @@ -70,6 +71,27 @@ def get_config(model: str,
logger.info("Updating %s from %r to %r", key,
getattr(config, key, None), value)
config.update({key: value})

if trust_remote_code:
# With trust_remote_code, the config is typically an instance of a
# custom class imported from the HF modules cache.
#
# The class will not be importable in Ray workers by default (and won't
# exist at all on other nodes), which breaks serialization of the
# config. Here we tell the serialization library used by Ray to pass
# instances of these generated classes by value instead of by reference
    # (e.g. the class definition is serialized along with its data).
#
# See: https://github.com/cloudpipe/cloudpickle?tab=readme-ov-file#overriding-pickles-serialization-mechanism-for-importable-constructs
try:
import transformers_modules
ray.cloudpickle.register_pickle_by_value(transformers_modules)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can use import cloudpickle rather than ray.cloudpickle


    # Ignore the ImportError raised when trust_remote_code was set but no
    # remote code was actually downloaded (so the transformers_modules
    # package does not exist).
except ImportError:
pass

return config


Expand Down
8 changes: 0 additions & 8 deletions vllm/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -766,14 +766,6 @@ def flatten_2d_lists(lists: List[List[T]]) -> List[T]:
return [item for sublist in lists for item in sublist]


def init_cached_hf_modules() -> None:
    """Initialize the Hugging Face dynamic-modules cache on demand.

    The import is deferred to the call site so that merely importing this
    utilities module does not pull in ``transformers`` (and, transitively,
    torch) before the caller is ready for it.
    """
    from transformers.dynamic_module_utils import init_hf_modules

    init_hf_modules()


@lru_cache(maxsize=None)
def find_library(lib_name: str) -> str:
"""
Expand Down
5 changes: 0 additions & 5 deletions vllm/worker/cpu_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,11 +153,6 @@ def __init__(
if self.is_driver_worker:
assert self.rank == 0, "The driver worker must have rank 0."

if self.model_config.trust_remote_code:
# note: lazy import to avoid importing torch before initializing
from vllm.utils import init_cached_hf_modules
init_cached_hf_modules()

# Setup OpenMP threads affinity.
omp_cpuids = envs.VLLM_CPU_OMP_THREADS_BIND
if omp_cpuids == "all":
Expand Down
4 changes: 0 additions & 4 deletions vllm/worker/neuron_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,6 @@ def __init__(
self.scheduler_config = scheduler_config
self.device_config = device_config
self.cache_config = cache_config
if self.model_config.trust_remote_code:
# note: lazy import to avoid importing torch before initializing
from vllm.utils import init_cached_hf_modules
init_cached_hf_modules()

self.model_runner: NeuronModelRunner = NeuronModelRunner(
model_config, parallel_config, scheduler_config, device_config)
Expand Down
5 changes: 0 additions & 5 deletions vllm/worker/openvino_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,11 +168,6 @@ def __init__(
if self.is_driver_worker:
assert self.rank == 0, "The driver worker must have rank 0."

if self.model_config.trust_remote_code:
# note: lazy import to avoid importing torch before initializing
from vllm.utils import init_cached_hf_modules

init_cached_hf_modules()
self.model_runner = OpenVINOModelRunner(
model_config,
parallel_config,
Expand Down
4 changes: 0 additions & 4 deletions vllm/worker/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,6 @@ def __init__(
if parallel_config and is_driver_worker:
assert rank % parallel_config.tensor_parallel_size == 0, \
"Driver worker should be rank 0 of tensor parallel group."
if self.model_config.trust_remote_code:
# note: lazy import to avoid importing torch before initializing
from vllm.utils import init_cached_hf_modules
init_cached_hf_modules()
self.multimodal_config = multimodal_config

# Return hidden states from target model if the draft model is an
Expand Down
4 changes: 0 additions & 4 deletions vllm/worker/worker_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,10 +336,6 @@ def __init__(
self.worker_class_name = worker_class_name
self.worker_class_fn = worker_class_fn
self.worker: Optional[WorkerBase] = None
if trust_remote_code:
# note: lazy import to avoid importing torch before initializing
from vllm.utils import init_cached_hf_modules
init_cached_hf_modules()

@staticmethod
def update_environment_variables(envs: Dict[str, str]) -> None:
Expand Down
Loading