Skip to content

Commit

Permalink
Add lazy import and auto-install dependencies (#414)
Browse files Browse the repository at this point in the history
* import lazyloader

* import autoinstall

* autoinstall check

* to be debugged

* debug done

* pre-commit done

* pre-commit done

* add doc

* remove decorator

* fix fasttext

* config test pass

* fix auto install scenedetect[opencv]

* use lazy_loader

* use lazy_loader

* add sys.executable

* fix is_package_available
  • Loading branch information
BeachWang authored Sep 25, 2024
1 parent 467cb96 commit 8e9b4c0
Show file tree
Hide file tree
Showing 60 changed files with 420 additions and 415 deletions.
6 changes: 6 additions & 0 deletions data_juicer/ops/base_op.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,24 @@
import copy
import os
import traceback
from functools import wraps

import pyarrow as pa
from loguru import logger

from data_juicer import is_cuda_available
from data_juicer.utils.auto_install_utils import AutoInstaller
from data_juicer.utils.constant import Fields
from data_juicer.utils.mm_utils import size_to_bytes
from data_juicer.utils.process_utils import calculate_np
from data_juicer.utils.registry import Registry

OPERATORS = Registry('Operators')
UNFORKABLE = Registry('Unforkable')
current_path = os.path.dirname(os.path.realpath(__file__))
version_file_path = os.path.join(current_path,
'../../environments/science_requires.txt')
AUTOINSTALL = AutoInstaller([version_file_path])


def convert_list_dict_to_dict_list(samples):
Expand Down
12 changes: 6 additions & 6 deletions data_juicer/ops/deduplicator/document_minhash_deduplicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,24 +7,23 @@
from collections import defaultdict
from typing import Optional

import lazy_loader as lazy
import numpy as np
import regex
from loguru import logger
from pydantic import Field, PositiveInt
from tqdm import tqdm
from typing_extensions import Annotated

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import HashKeys
from data_juicer.utils.model_utils import prepare_sentencepiece_model

from ..base_op import OPERATORS, Deduplicator
from ..base_op import AUTOINSTALL, OPERATORS, Deduplicator
from ..common.helper_func import UnionFind, split_on_whitespace

OP_NAME = 'document_minhash_deduplicator'

with AvailabilityChecking(['scipy'], OP_NAME):
from scipy.integrate import quad as integrate
integrate = lazy.load('scipy.integrate')

MERSENNE_PRIME = np.uint64((1 << 61) - 1)
MAX_HASH = np.uint64((1 << 32) - 1)
Expand Down Expand Up @@ -70,7 +69,7 @@ def false_positive_probability(th: float, band: int, rows: int):
def proba(s):
return 1 - (1 - s**float(rows))**float(band)

a, _ = integrate(proba, 0.0, th)
a, _ = integrate.quad(proba, 0.0, th)
return a

def false_negative_probability(th: float, band: int, rows: int):
Expand All @@ -79,7 +78,7 @@ def false_negative_probability(th: float, band: int, rows: int):
def proba(s):
return 1 - (1 - (1 - s**float(rows))**float(band))

a, _ = integrate(proba, th, 1.0)
a, _ = integrate.quad(proba, th, 1.0)
return a

# object: minimize the weighted FP and FN ratio
Expand Down Expand Up @@ -152,6 +151,7 @@ def __init__(
sentencepiece tokenization.
"""
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['scipy'])
# about minhash computation
self.tokenization = tokenization
self.window_size = window_size
Expand Down
8 changes: 4 additions & 4 deletions data_juicer/ops/deduplicator/document_simhash_deduplicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,20 @@
from collections import defaultdict, deque
from typing import Dict, Optional, Set

import lazy_loader as lazy
import numpy as np
import regex
from loguru import logger
from pydantic import PositiveInt

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import HashKeys

from ..base_op import OPERATORS, Deduplicator
from ..base_op import AUTOINSTALL, OPERATORS, Deduplicator
from ..common.helper_func import split_on_whitespace

OP_NAME = 'document_simhash_deduplicator'

with AvailabilityChecking(['simhash-pybind'], OP_NAME):
import simhash
simhash = lazy.load('simhash')


@OPERATORS.register_module(OP_NAME)
Expand Down Expand Up @@ -57,6 +56,7 @@ def __init__(self,
"""
# about simhash computation
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['simhash-pybind'])
self.tokenization = tokenization
self.window_size = window_size
self.lowercase = lowercase
Expand Down
24 changes: 10 additions & 14 deletions data_juicer/ops/deduplicator/image_deduplicator.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,29 @@
from collections import defaultdict
from typing import Dict, Set, Tuple

import lazy_loader as lazy
import numpy as np

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import HashKeys
from data_juicer.utils.mm_utils import load_data_with_context, load_image

from ..base_op import OPERATORS, Deduplicator
from ..base_op import AUTOINSTALL, OPERATORS, Deduplicator
from ..op_fusion import LOADED_IMAGES
from .document_deduplicator import DocumentDeduplicator

OP_NAME = 'image_deduplicator'

with AvailabilityChecking(['imagededup'], OP_NAME):
import imagededup # noqa: F401
imagededup = lazy.load('imagededup')

HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'}
HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'}

def get_hash_method(method_name):
from imagededup.methods import AHash, DHash, PHash, WHash

mapping = {
'phash': PHash,
'dhash': DHash,
'whash': WHash,
'ahash': AHash
}
def get_hash_method(method_name):
from imagededup.methods import AHash, DHash, PHash, WHash

return mapping[method_name]
mapping = {'phash': PHash, 'dhash': DHash, 'whash': WHash, 'ahash': AHash}

return mapping[method_name]


@OPERATORS.register_module(OP_NAME)
Expand All @@ -54,6 +49,7 @@ def __init__(self,
:param kwargs: extra args
"""
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['imagededup'])
if method not in HASH_METHOD:
raise ValueError(f'Keep strategy [{method}] is not supported. '
f'Can only be one of {HASH_METHOD}.')
Expand Down
24 changes: 10 additions & 14 deletions data_juicer/ops/deduplicator/ray_image_deduplicator.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,26 @@
import lazy_loader as lazy
import numpy as np
from pydantic import PositiveInt

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.mm_utils import load_data_with_context, load_image

from ..base_op import OPERATORS
from ..base_op import AUTOINSTALL, OPERATORS
from ..op_fusion import LOADED_IMAGES
from .ray_basic_deduplicator import RayBasicDeduplicator

OP_NAME = 'ray_image_deduplicator'

with AvailabilityChecking(['imagededup'], OP_NAME):
import imagededup # noqa: F401
imagededup = lazy.load('imagededup')

HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'}
HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'}

def get_hash_method(method_name):
from imagededup.methods import AHash, DHash, PHash, WHash

mapping = {
'phash': PHash,
'dhash': DHash,
'whash': WHash,
'ahash': AHash
}
def get_hash_method(method_name):
from imagededup.methods import AHash, DHash, PHash, WHash

return mapping[method_name]
mapping = {'phash': PHash, 'dhash': DHash, 'whash': WHash, 'ahash': AHash}

return mapping[method_name]


@OPERATORS.register_module(OP_NAME)
Expand Down Expand Up @@ -53,6 +48,7 @@ def __init__(self,
redis_port=redis_port,
*args,
**kwargs)
AUTOINSTALL.check(['imagededup'])
if method not in HASH_METHOD:
raise ValueError(f'Keep strategy [{method}] is not supported. '
f'Can only be one of {HASH_METHOD}.')
Expand Down
7 changes: 2 additions & 5 deletions data_juicer/ops/filter/alphanumeric_filter.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,13 @@
import sys

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import OPERATORS, Filter
from ..base_op import AUTOINSTALL, OPERATORS, Filter
from ..common import get_words_from_document

OP_NAME = 'alphanumeric_filter'

with AvailabilityChecking(['transformers'], OP_NAME):
import transformers # noqa: F401


@OPERATORS.register_module('alphanumeric_filter')
class AlphanumericFilter(Filter):
Expand Down Expand Up @@ -43,6 +39,7 @@ def __init__(self,
:param kwargs: extra args
"""
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['transformers'])
self.tokenization = tokenization
self.min_ratio = min_ratio
self.max_ratio = max_ratio
Expand Down
7 changes: 2 additions & 5 deletions data_juicer/ops/filter/flagged_words_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,17 @@

from pydantic import PositiveInt

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields, InterVars, StatsKeys
from data_juicer.utils.model_utils import get_model, prepare_model

from ...utils.asset_utils import ASSET_DIR, load_words_asset
from ..base_op import OPERATORS, Filter
from ..base_op import AUTOINSTALL, OPERATORS, Filter
from ..common import (SPECIAL_CHARACTERS, get_words_from_document,
words_refinement)
from ..op_fusion import INTER_WORDS

OP_NAME = 'flagged_words_filter'

with AvailabilityChecking(['sentencepiece'], OP_NAME):
import sentencepiece # noqa: F401


@OPERATORS.register_module(OP_NAME)
@INTER_WORDS.register_module(OP_NAME)
Expand Down Expand Up @@ -58,6 +54,7 @@ def __init__(self,
:param kwargs: extra args
"""
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['sentencepiece'])
self.lang = lang
self.max_ratio = max_ratio
self.use_words_aug = use_words_aug
Expand Down
15 changes: 5 additions & 10 deletions data_juicer/ops/filter/image_aesthetics_filter.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,18 @@
import lazy_loader as lazy
import numpy as np
from loguru import logger

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.mm_utils import load_data_with_context, load_image

from ...utils.model_utils import get_model, prepare_model
from ..base_op import OPERATORS, Filter
from ..base_op import AUTOINSTALL, OPERATORS, Filter
from ..op_fusion import LOADED_IMAGES

OP_NAME = 'image_aesthetics_filter'
CHECK_PKGs = ['torch', 'transformers', 'simple-aesthetics-predictor']

with AvailabilityChecking(CHECK_PKGs, OP_NAME):

import aesthetics_predictor # noqa: F401
import torch
import transformers # noqa: F401

# avoid hanging when calling clip in multiprocessing
torch.set_num_threads(1)
torch = lazy.load('torch')


@OPERATORS.register_module(OP_NAME)
Expand Down Expand Up @@ -56,6 +49,8 @@ def __init__(self,
"""

super().__init__(*args, **kwargs)
AUTOINSTALL.check(
['torch', 'transformers', 'simple-aesthetics-predictor'])
if hf_scorer_model == '':
hf_scorer_model = \
'shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE'
Expand Down
8 changes: 4 additions & 4 deletions data_juicer/ops/filter/image_face_ratio_filter.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,20 @@
import os

import lazy_loader as lazy
import numpy as np
from loguru import logger

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.mm_utils import (detect_faces, load_data_with_context,
load_image)
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import OPERATORS, UNFORKABLE, Filter
from ..base_op import AUTOINSTALL, OPERATORS, UNFORKABLE, Filter
from ..op_fusion import LOADED_IMAGES

OP_NAME = 'image_face_ratio_filter'

with AvailabilityChecking(['opencv-python'], OP_NAME):
import cv2
cv2 = lazy.load('cv2')


@UNFORKABLE.register_module(OP_NAME)
Expand Down Expand Up @@ -54,6 +53,7 @@ def __init__(self,
:param kwargs: Extra keyword arguments.
"""
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['opencv-python'])

if cv_classifier == '':
cv_classifier = os.path.join(cv2.data.haarcascades,
Expand Down
13 changes: 5 additions & 8 deletions data_juicer/ops/filter/image_nsfw_filter.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,17 @@
import lazy_loader as lazy
import numpy as np

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.mm_utils import load_data_with_context, load_image
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import OPERATORS, Filter
from ..base_op import AUTOINSTALL, OPERATORS, Filter
from ..op_fusion import LOADED_IMAGES

OP_NAME = 'image_nsfw_filter'

with AvailabilityChecking(['torch', 'transformers'], OP_NAME):
import torch
import transformers # noqa: F401

# avoid hanging when calling nsfw detection in multiprocessing
torch.set_num_threads(1)
torch = lazy.load('torch')
transformers = lazy.load('transformers')


@OPERATORS.register_module(OP_NAME)
Expand Down Expand Up @@ -47,6 +43,7 @@ def __init__(self,
:param kwargs: extra args
"""
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['torch', 'transformers'])
self.score_threshold = score_threshold
if any_or_all not in ['any', 'all']:
raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
Expand Down
Loading

0 comments on commit 8e9b4c0

Please sign in to comment.